Part 1. Explore
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.1 ✔ purrr 0.3.2
## ✔ tibble 2.1.1 ✔ dplyr 0.8.0.1
## ✔ tidyr 0.8.3 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
1. Data Visualization with ggplot2
library(tidyverse)
mpg
## # A tibble: 234 x 11
## manufacturer model displ year cyl trans drv cty hwy fl class
## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
## 1 audi a4 1.8 1999 4 auto… f 18 29 p comp…
## 2 audi a4 1.8 1999 4 manu… f 21 29 p comp…
## 3 audi a4 2 2008 4 manu… f 20 31 p comp…
## 4 audi a4 2 2008 4 auto… f 21 30 p comp…
## 5 audi a4 2.8 1999 6 auto… f 16 26 p comp…
## 6 audi a4 2.8 1999 6 manu… f 18 26 p comp…
## 7 audi a4 3.1 2008 6 auto… f 18 27 p comp…
## 8 audi a4 q… 1.8 1999 4 manu… 4 18 26 p comp…
## 9 audi a4 q… 1.8 1999 4 auto… 4 16 25 p comp…
## 10 audi a4 q… 2 2008 4 manu… 4 20 28 p comp…
## # … with 224 more rows
ggplot(data = mpg) + geom_point(mapping = aes(x = displ, y = hwy))

ggplot(data = mpg) + geom_point(mapping = aes(x = displ, y = hwy, color = class))

ggplot(data = mpg) + geom_point(mapping = aes(x = displ, y = hwy, size = class))
## Warning: Using size for a discrete variable is not advised.

ggplot(data = mpg) + geom_point(mapping = aes(x = displ, y = hwy, alpha = class))
## Warning: Using alpha for a discrete variable is not advised.

ggplot(data = mpg) + geom_point(mapping = aes(x = displ, y = hwy, shape = class))
## Warning: The shape palette can deal with a maximum of 6 discrete values
## because more than 6 becomes difficult to discriminate; you have 7.
## Consider specifying shapes manually if you must have them.
## Warning: Removed 62 rows containing missing values (geom_point).

ggplot(data = mpg) + geom_point(mapping = aes(x = displ, y = hwy), color = "blue")

ggplot(data = mpg) + geom_point(mapping = aes(x = displ, y = hwy)) + facet_wrap( ~ class, nrow = 2)

ggplot(data = mpg) + geom_point(mapping = aes(x = displ, y = hwy)) + facet_grid(drv ~ cyl)

ggplot(data = mpg) + geom_smooth(mapping = aes(x = displ, y = hwy))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

ggplot(data = mpg) + geom_smooth(mapping = aes(x = displ, y = hwy, linetype = drv))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

ggplot(data = mpg) + geom_point(mapping = aes(x = displ, y = hwy, color = drv)) + geom_smooth(mapping = aes(x = displ, y = hwy, linetype = drv, color = drv))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

ggplot(data = mpg) + geom_smooth(mapping = aes(displ, y = hwy))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

ggplot(data = mpg) + geom_smooth(mapping = aes(x = displ, y = hwy, group = drv))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

ggplot(data = mpg) + geom_smooth(mapping = aes(x = displ, y = hwy, color = drv), show.legend = FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

ggplot(data = mpg) + geom_point(mapping = aes(x = displ, y = hwy)) + geom_smooth(mapping = aes(x = displ, y = hwy))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) + geom_point() + geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) + geom_point(mapping = aes(color = class)) + geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) + geom_point(mapping = aes(color = class)) + geom_smooth(data = filter(mpg, class == "subcompact"), se = FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

ggplot(data = diamonds) + geom_bar(mapping = aes(x = cut))

ggplot(data = diamonds) + stat_count(mapping = aes(x = cut))

demo <- tribble(~a, ~b,
"bar_1", 20,
"bar_2", 30,
"bar_3", 40)
ggplot(data = demo) + geom_bar(mapping = aes(x = a, y = b), stat = 'identity')

ggplot(data = diamonds) + geom_bar(mapping = aes(x = cut, y = ..prop.., group = 1))

ggplot(data = diamonds) + stat_summary(mapping = aes(x = cut, y = depth),
fun.ymin = min, fun.ymax = max,
fun.y = median)

ggplot(data = diamonds) + geom_bar(mapping = aes(x = cut, color = cut))

ggplot(data = diamonds) + geom_bar(mapping = aes(x = cut, fill = cut))

ggplot(data = diamonds) + geom_bar(mapping = aes(x = cut, fill = clarity))

ggplot(data = diamonds, mapping = aes(x = cut, fill = clarity)) +
geom_bar(alpha = 1/5, position = "identity")

ggplot(data = diamonds, mapping = aes(x = cut, color = clarity)) +
geom_bar(fill = NA, position = "identity")

ggplot(data = diamonds) + geom_bar(mapping = aes(x = cut, fill = clarity), position = "fill")

ggplot(data = diamonds) + geom_bar(mapping = aes(x = cut, fill = clarity), position = "dodge")

ggplot(data = mpg) + geom_point(mapping = aes(x = displ, y = hwy), position = "jitter")

ggplot(data = mpg, mapping = aes(x = class, y = hwy)) + geom_boxplot()

ggplot(data = mpg, mapping = aes(x = class, y = hwy)) + geom_boxplot() + coord_flip()

nz <- map_data("nz")
##
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
##
## map
ggplot(nz, aes(long, lat, group = group)) +
geom_polygon(fill = "white", color = "black")

ggplot(nz, aes(long, lat, group = group)) +
geom_polygon(fill = "white", color = "black") +
coord_quickmap()

bar <- ggplot(data = diamonds) + geom_bar(mapping = aes(x = cut, fill = cut),
show.legend = FALSE,
width = 1) + theme(aspect.ratio = 1) + labs(x = NULL, y = NULL)
bar + coord_flip()

bar + coord_polar()

2. Workflow: Basic
seq(1,10)
## [1] 1 2 3 4 5 6 7 8 9 10
(y <- seq(1,10,length.out = 5))
## [1] 1.00 3.25 5.50 7.75 10.00
3. Data transformation with dplyr
library(nycflights13)
library(tidyverse)
flights
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## 7 2013 1 1 555 600 -5 913
## 8 2013 1 1 557 600 -3 709
## 9 2013 1 1 557 600 -3 838
## 10 2013 1 1 558 600 -2 753
## # … with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
class(flights)
## [1] "tbl_df" "tbl" "data.frame"
filter(flights, month == 1, day == 1)
## # A tibble: 842 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## 7 2013 1 1 555 600 -5 913
## 8 2013 1 1 557 600 -3 709
## 9 2013 1 1 557 600 -3 838
## 10 2013 1 1 558 600 -2 753
## # … with 832 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
jan1 <- filter(flights, month == 1, day == 1)
(dec25 <- filter(flights, month == 12, day == 25))
## # A tibble: 719 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 12 25 456 500 -4 649
## 2 2013 12 25 524 515 9 805
## 3 2013 12 25 542 540 2 832
## 4 2013 12 25 546 550 -4 1022
## 5 2013 12 25 556 600 -4 730
## 6 2013 12 25 557 600 -3 743
## 7 2013 12 25 557 600 -3 818
## 8 2013 12 25 559 600 -1 855
## 9 2013 12 25 559 600 -1 849
## 10 2013 12 25 600 600 0 850
## # … with 709 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
filter(flights, month == 11 | month == 12)
## # A tibble: 55,403 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 11 1 5 2359 6 352
## 2 2013 11 1 35 2250 105 123
## 3 2013 11 1 455 500 -5 641
## 4 2013 11 1 539 545 -6 856
## 5 2013 11 1 542 545 -3 831
## 6 2013 11 1 549 600 -11 912
## 7 2013 11 1 550 600 -10 705
## 8 2013 11 1 554 600 -6 659
## 9 2013 11 1 554 600 -6 826
## 10 2013 11 1 554 600 -6 749
## # … with 55,393 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
nov_dec <- filter(flights, month %in% c(11,12))
filter(flights, !(arr_delay < 120 | dep_delay > 120))
## # A tibble: 1,783 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 811 630 101 1047
## 2 2013 1 1 1505 1310 115 1638
## 3 2013 1 1 1525 1340 105 1831
## 4 2013 1 1 1549 1445 64 1912
## 5 2013 1 1 1558 1359 119 1718
## 6 2013 1 1 1732 1630 62 2028
## 7 2013 1 1 1803 1620 103 2008
## 8 2013 1 1 2119 1930 109 2358
## 9 2013 1 2 817 630 107 1107
## 10 2013 1 2 1710 1526 104 1857
## # … with 1,773 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
filter(flights, arr_delay <= 120, dep_delay <= 120)
## # A tibble: 316,050 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## 7 2013 1 1 555 600 -5 913
## 8 2013 1 1 557 600 -3 709
## 9 2013 1 1 557 600 -3 838
## 10 2013 1 1 558 600 -2 753
## # … with 316,040 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
x = NA
is.na(x)
## [1] TRUE
# filter only includes rows where the condition is TRUE, it excludes both FALSE and NA values.
df <- tibble(x = c(1,NA,3))
filter(df, x > 1)
## # A tibble: 1 x 1
## x
## <dbl>
## 1 3
filter(df, is.na(x) | x > 1)
## # A tibble: 2 x 1
## x
## <dbl>
## 1 NA
## 2 3
arrange(flights, year, month, day)
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## 7 2013 1 1 555 600 -5 913
## 8 2013 1 1 557 600 -3 709
## 9 2013 1 1 557 600 -3 838
## 10 2013 1 1 558 600 -2 753
## # … with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
arrange(flights, desc(arr_delay))
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 9 641 900 1301 1242
## 2 2013 6 15 1432 1935 1137 1607
## 3 2013 1 10 1121 1635 1126 1239
## 4 2013 9 20 1139 1845 1014 1457
## 5 2013 7 22 845 1600 1005 1044
## 6 2013 4 10 1100 1900 960 1342
## 7 2013 3 17 2321 810 911 135
## 8 2013 7 22 2257 759 898 121
## 9 2013 12 5 756 1700 896 1058
## 10 2013 5 3 1133 2055 878 1250
## # … with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
# missing values are always sorted at the end
df <- tibble(x = c(5,2,NA))
arrange(df, x)
## # A tibble: 3 x 1
## x
## <dbl>
## 1 2
## 2 5
## 3 NA
select(flights, year, month, day)
## # A tibble: 336,776 x 3
## year month day
## <int> <int> <int>
## 1 2013 1 1
## 2 2013 1 1
## 3 2013 1 1
## 4 2013 1 1
## 5 2013 1 1
## 6 2013 1 1
## 7 2013 1 1
## 8 2013 1 1
## 9 2013 1 1
## 10 2013 1 1
## # … with 336,766 more rows
select(flights, year:day)
## # A tibble: 336,776 x 3
## year month day
## <int> <int> <int>
## 1 2013 1 1
## 2 2013 1 1
## 3 2013 1 1
## 4 2013 1 1
## 5 2013 1 1
## 6 2013 1 1
## 7 2013 1 1
## 8 2013 1 1
## 9 2013 1 1
## 10 2013 1 1
## # … with 336,766 more rows
select(flights, -(year:day))
## # A tibble: 336,776 x 16
## dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay
## <int> <int> <dbl> <int> <int> <dbl>
## 1 517 515 2 830 819 11
## 2 533 529 4 850 830 20
## 3 542 540 2 923 850 33
## 4 544 545 -1 1004 1022 -18
## 5 554 600 -6 812 837 -25
## 6 554 558 -4 740 728 12
## 7 555 600 -5 913 854 19
## 8 557 600 -3 709 723 -14
## 9 557 600 -3 838 846 -8
## 10 558 600 -2 753 745 8
## # … with 336,766 more rows, and 10 more variables: carrier <chr>,
## # flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
rename(flights, tail_num = tailnum)
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## 7 2013 1 1 555 600 -5 913
## 8 2013 1 1 557 600 -3 709
## 9 2013 1 1 557 600 -3 838
## 10 2013 1 1 558 600 -2 753
## # … with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tail_num <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
select(flights, time_hour, air_time, everything())
## # A tibble: 336,776 x 19
## time_hour air_time year month day dep_time sched_dep_time
## <dttm> <dbl> <int> <int> <int> <int> <int>
## 1 2013-01-01 05:00:00 227 2013 1 1 517 515
## 2 2013-01-01 05:00:00 227 2013 1 1 533 529
## 3 2013-01-01 05:00:00 160 2013 1 1 542 540
## 4 2013-01-01 05:00:00 183 2013 1 1 544 545
## 5 2013-01-01 06:00:00 116 2013 1 1 554 600
## 6 2013-01-01 05:00:00 150 2013 1 1 554 558
## 7 2013-01-01 06:00:00 158 2013 1 1 555 600
## 8 2013-01-01 06:00:00 53 2013 1 1 557 600
## 9 2013-01-01 06:00:00 140 2013 1 1 557 600
## 10 2013-01-01 06:00:00 138 2013 1 1 558 600
## # … with 336,766 more rows, and 12 more variables: dep_delay <dbl>,
## # arr_time <int>, sched_arr_time <int>, arr_delay <dbl>, carrier <chr>,
## # flight <int>, tailnum <chr>, origin <chr>, dest <chr>, distance <dbl>,
## # hour <dbl>, minute <dbl>
flights_sml <- select(flights,
year:day,
ends_with("delay"),
distance,
air_time)
mutate(flights_sml,
gain = arr_delay - dep_delay,
speed = distance / air_time * 60)
## # A tibble: 336,776 x 9
## year month day dep_delay arr_delay distance air_time gain speed
## <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2013 1 1 2 11 1400 227 9 370.
## 2 2013 1 1 4 20 1416 227 16 374.
## 3 2013 1 1 2 33 1089 160 31 408.
## 4 2013 1 1 -1 -18 1576 183 -17 517.
## 5 2013 1 1 -6 -25 762 116 -19 394.
## 6 2013 1 1 -4 12 719 150 16 288.
## 7 2013 1 1 -5 19 1065 158 24 404.
## 8 2013 1 1 -3 -14 229 53 -11 259.
## 9 2013 1 1 -3 -8 944 140 -5 405.
## 10 2013 1 1 -2 8 733 138 10 319.
## # … with 336,766 more rows
mutate(flights_sml,
gain = arr_delay - dep_delay,
hours = air_time / 60,
gain_per_hour = gain / hours)
## # A tibble: 336,776 x 10
## year month day dep_delay arr_delay distance air_time gain hours
## <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2013 1 1 2 11 1400 227 9 3.78
## 2 2013 1 1 4 20 1416 227 16 3.78
## 3 2013 1 1 2 33 1089 160 31 2.67
## 4 2013 1 1 -1 -18 1576 183 -17 3.05
## 5 2013 1 1 -6 -25 762 116 -19 1.93
## 6 2013 1 1 -4 12 719 150 16 2.5
## 7 2013 1 1 -5 19 1065 158 24 2.63
## 8 2013 1 1 -3 -14 229 53 -11 0.883
## 9 2013 1 1 -3 -8 944 140 -5 2.33
## 10 2013 1 1 -2 8 733 138 10 2.3
## # … with 336,766 more rows, and 1 more variable: gain_per_hour <dbl>
transmute(flights,
gain = arr_delay - dep_delay,
hours = air_time / 60,
gain_per_hour = gain / hours)
## # A tibble: 336,776 x 3
## gain hours gain_per_hour
## <dbl> <dbl> <dbl>
## 1 9 3.78 2.38
## 2 16 3.78 4.23
## 3 31 2.67 11.6
## 4 -17 3.05 -5.57
## 5 -19 1.93 -9.83
## 6 16 2.5 6.4
## 7 24 2.63 9.11
## 8 -11 0.883 -12.5
## 9 -5 2.33 -2.14
## 10 10 2.3 4.35
## # … with 336,766 more rows
transmute(flights,
dep_time,
hour = dep_time %/% 100,
minute = dep_time %% 100)
## # A tibble: 336,776 x 3
## dep_time hour minute
## <int> <dbl> <dbl>
## 1 517 5 17
## 2 533 5 33
## 3 542 5 42
## 4 544 5 44
## 5 554 5 54
## 6 554 5 54
## 7 555 5 55
## 8 557 5 57
## 9 557 5 57
## 10 558 5 58
## # … with 336,766 more rows
x <- 1:10
lag(x)
## [1] NA 1 2 3 4 5 6 7 8 9
lead(x)
## [1] 2 3 4 5 6 7 8 9 10 NA
cumsum(x)
## [1] 1 3 6 10 15 21 28 36 45 55
cummean(x)
## [1] 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0 5.5
v <- c(1, 2, 2, NA, 3, 4)
min_rank(y)
## [1] 1 2 3 4 5
min_rank(desc(y))
## [1] 5 4 3 2 1
row_number(y)
## [1] 1 2 3 4 5
dense_rank(y)
## [1] 1 2 3 4 5
percent_rank(y)
## [1] 0.00 0.25 0.50 0.75 1.00
cume_dist(y)
## [1] 0.2 0.4 0.6 0.8 1.0
summarize(flights, delay = mean(dep_delay, na.rm = TRUE))
## # A tibble: 1 x 1
## delay
## <dbl>
## 1 12.6
by_day <- group_by(flights, year, month, day)
summarize(by_day, delay = mean(dep_delay, na.rm = TRUE))
## # A tibble: 365 x 4
## # Groups: year, month [12]
## year month day delay
## <int> <int> <int> <dbl>
## 1 2013 1 1 11.5
## 2 2013 1 2 13.9
## 3 2013 1 3 11.0
## 4 2013 1 4 8.95
## 5 2013 1 5 5.73
## 6 2013 1 6 7.15
## 7 2013 1 7 5.42
## 8 2013 1 8 2.55
## 9 2013 1 9 2.28
## 10 2013 1 10 2.84
## # … with 355 more rows
by_dest <- group_by(flights, dest)
delay <- summarize(by_dest,
count = n(),
dist = mean(distance, na.rm = TRUE),
delay = mean(arr_delay, na.rm = TRUE))
delay <- filter(delay, count > 20, dest != "HNL")
ggplot(data = delay, mapping = aes(x = dist, y = delay)) +
geom_point(aes(size = count), alpha = 1/3) +
geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

delays <- flights %>%
group_by(dest) %>%
summarize(count = n(),
dist = mean(distance, na.rm = TRUE),
delay = mean(arr_delay, na.rm = TRUE)) %>%
filter(count > 20, dest != "HNL")
?ggvis
## No documentation for 'ggvis' in specified packages and libraries:
## you could try '??ggvis'
flights %>% group_by(year, month, day) %>%
summarize(mean = mean(dep_delay))
## # A tibble: 365 x 4
## # Groups: year, month [12]
## year month day mean
## <int> <int> <int> <dbl>
## 1 2013 1 1 NA
## 2 2013 1 2 NA
## 3 2013 1 3 NA
## 4 2013 1 4 NA
## 5 2013 1 5 NA
## 6 2013 1 6 NA
## 7 2013 1 7 NA
## 8 2013 1 8 NA
## 9 2013 1 9 NA
## 10 2013 1 10 NA
## # … with 355 more rows
flights %>%
group_by(year, month, day) %>%
summarize(mean = mean(dep_delay, na.rm = TRUE))
## # A tibble: 365 x 4
## # Groups: year, month [12]
## year month day mean
## <int> <int> <int> <dbl>
## 1 2013 1 1 11.5
## 2 2013 1 2 13.9
## 3 2013 1 3 11.0
## 4 2013 1 4 8.95
## 5 2013 1 5 5.73
## 6 2013 1 6 7.15
## 7 2013 1 7 5.42
## 8 2013 1 8 2.55
## 9 2013 1 9 2.28
## 10 2013 1 10 2.84
## # … with 355 more rows
not_cancelled <- flights %>%
filter(!is.na(dep_delay), !is.na(arr_delay))
not_cancelled %>%
group_by(year, month, day) %>%
summarize(mean = mean(dep_delay))
## # A tibble: 365 x 4
## # Groups: year, month [12]
## year month day mean
## <int> <int> <int> <dbl>
## 1 2013 1 1 11.4
## 2 2013 1 2 13.7
## 3 2013 1 3 10.9
## 4 2013 1 4 8.97
## 5 2013 1 5 5.73
## 6 2013 1 6 7.15
## 7 2013 1 7 5.42
## 8 2013 1 8 2.56
## 9 2013 1 9 2.30
## 10 2013 1 10 2.84
## # … with 355 more rows
delays <- not_cancelled %>%
group_by(tailnum) %>%
summarize(delay = mean(arr_delay))
ggplot(data = delays, mapping = aes(x = delay)) +
geom_freqpoly(binwidth = 10)

delays <- not_cancelled %>%
group_by(tailnum) %>%
summarize(delay = mean(arr_delay, na.rm = TRUE),
n = n())
ggplot(data = delays, mapping = aes(x = n, y = delay)) +
geom_point(alpha = 1/10)

delays %>%
filter(n > 25) %>%
ggplot(mapping = aes(x = n, y = delay)) +
geom_point(alpha = 1/10)

batting <- as_tibble(Lahman::Batting)
batters <- batting %>%
group_by(playerID) %>%
summarize(ba = sum(H, na.rm = TRUE) / sum(AB, na.rm = TRUE),
ab = sum(AB, na.rm = TRUE))
batters %>%
filter(ab > 100) %>%
ggplot(mapping = aes(x = ab, y = ba)) +
geom_point() +
geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

batters %>% arrange(desc(ba))
## # A tibble: 18,915 x 3
## playerID ba ab
## <chr> <dbl> <int>
## 1 abramge01 1 1
## 2 banisje01 1 1
## 3 bartocl01 1 1
## 4 bassdo01 1 1
## 5 berrijo01 1 1
## 6 birasst01 1 2
## 7 bruneju01 1 1
## 8 burnscb01 1 1
## 9 cammaer01 1 1
## 10 campsh01 1 1
## # … with 18,905 more rows
not_cancelled %>%
group_by(year, month, day) %>%
summarize(avg_delay1 = mean(arr_delay),
avg_delay2 = mean(arr_delay[arr_delay > 0]))
## # A tibble: 365 x 5
## # Groups: year, month [12]
## year month day avg_delay1 avg_delay2
## <int> <int> <int> <dbl> <dbl>
## 1 2013 1 1 12.7 32.5
## 2 2013 1 2 12.7 32.0
## 3 2013 1 3 5.73 27.7
## 4 2013 1 4 -1.93 28.3
## 5 2013 1 5 -1.53 22.6
## 6 2013 1 6 4.24 24.4
## 7 2013 1 7 -4.95 27.8
## 8 2013 1 8 -3.23 20.8
## 9 2013 1 9 -0.264 25.6
## 10 2013 1 10 -5.90 27.3
## # … with 355 more rows
not_cancelled %>%
group_by(dest) %>%
summarize(distance_sd = sd(distance)) %>%
arrange(desc(distance_sd))
## # A tibble: 104 x 2
## dest distance_sd
## <chr> <dbl>
## 1 EGE 10.5
## 2 SAN 10.4
## 3 SFO 10.2
## 4 HNL 10.0
## 5 SEA 9.98
## 6 LAS 9.91
## 7 PDX 9.87
## 8 PHX 9.86
## 9 LAX 9.66
## 10 IND 9.46
## # … with 94 more rows
not_cancelled %>%
group_by(year, month, day) %>%
summarize(first = min(dep_time),
last = max(dep_time))
## # A tibble: 365 x 5
## # Groups: year, month [12]
## year month day first last
## <int> <int> <int> <dbl> <dbl>
## 1 2013 1 1 517 2356
## 2 2013 1 2 42 2354
## 3 2013 1 3 32 2349
## 4 2013 1 4 25 2358
## 5 2013 1 5 14 2357
## 6 2013 1 6 16 2355
## 7 2013 1 7 49 2359
## 8 2013 1 8 454 2351
## 9 2013 1 9 2 2252
## 10 2013 1 10 3 2320
## # … with 355 more rows
not_cancelled %>%
group_by(year, month, day) %>%
summarize(first_dep = first(dep_time),
last_dep = last(dep_time))
## # A tibble: 365 x 5
## # Groups: year, month [12]
## year month day first_dep last_dep
## <int> <int> <int> <int> <int>
## 1 2013 1 1 517 2356
## 2 2013 1 2 42 2354
## 3 2013 1 3 32 2349
## 4 2013 1 4 25 2358
## 5 2013 1 5 14 2357
## 6 2013 1 6 16 2355
## 7 2013 1 7 49 2359
## 8 2013 1 8 454 2351
## 9 2013 1 9 2 2252
## 10 2013 1 10 3 2320
## # … with 355 more rows
not_cancelled %>%
group_by(year, month, day) %>%
mutate(r = min_rank(desc(dep_time))) %>%
filter(r %in% range(r))
## # A tibble: 770 x 20
## # Groups: year, month, day [365]
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 2356 2359 -3 425
## 3 2013 1 2 42 2359 43 518
## 4 2013 1 2 2354 2359 -5 413
## 5 2013 1 3 32 2359 33 504
## 6 2013 1 3 2349 2359 -10 434
## 7 2013 1 4 25 2359 26 505
## 8 2013 1 4 2358 2359 -1 429
## 9 2013 1 4 2358 2359 -1 436
## 10 2013 1 5 14 2359 15 503
## # … with 760 more rows, and 13 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>, r <int>
not_cancelled %>%
group_by(dest) %>%
summarize(carriers = n_distinct(carrier)) %>%
arrange(desc(carriers))
## # A tibble: 104 x 2
## dest carriers
## <chr> <int>
## 1 ATL 7
## 2 BOS 7
## 3 CLT 7
## 4 ORD 7
## 5 TPA 7
## 6 AUS 6
## 7 DCA 6
## 8 DTW 6
## 9 IAD 6
## 10 MSP 6
## # … with 94 more rows
not_cancelled %>%
count(dest)
## # A tibble: 104 x 2
## dest n
## <chr> <int>
## 1 ABQ 254
## 2 ACK 264
## 3 ALB 418
## 4 ANC 8
## 5 ATL 16837
## 6 AUS 2411
## 7 AVL 261
## 8 BDL 412
## 9 BGR 358
## 10 BHM 269
## # … with 94 more rows
not_cancelled %>%
count(tailnum, wt = distance)
## # A tibble: 4,037 x 2
## tailnum n
## <chr> <dbl>
## 1 D942DN 3418
## 2 N0EGMQ 239143
## 3 N10156 109664
## 4 N102UW 25722
## 5 N103US 24619
## 6 N104UW 24616
## 7 N10575 139903
## 8 N105UW 23618
## 9 N107US 21677
## 10 N108UW 32070
## # … with 4,027 more rows
not_cancelled %>%
group_by(year, month, day) %>%
summarize(n_early = sum(dep_time < 500))
## # A tibble: 365 x 4
## # Groups: year, month [12]
## year month day n_early
## <int> <int> <int> <int>
## 1 2013 1 1 0
## 2 2013 1 2 3
## 3 2013 1 3 4
## 4 2013 1 4 3
## 5 2013 1 5 3
## 6 2013 1 6 2
## 7 2013 1 7 2
## 8 2013 1 8 1
## 9 2013 1 9 3
## 10 2013 1 10 3
## # … with 355 more rows
not_cancelled %>%
group_by(year, month, day) %>%
summarize(hour_perc = mean(arr_delay > 60))
## # A tibble: 365 x 4
## # Groups: year, month [12]
## year month day hour_perc
## <int> <int> <int> <dbl>
## 1 2013 1 1 0.0722
## 2 2013 1 2 0.0851
## 3 2013 1 3 0.0567
## 4 2013 1 4 0.0396
## 5 2013 1 5 0.0349
## 6 2013 1 6 0.0470
## 7 2013 1 7 0.0333
## 8 2013 1 8 0.0213
## 9 2013 1 9 0.0202
## 10 2013 1 10 0.0183
## # … with 355 more rows
daily <- group_by(flights, year, month, day)
(per_day <- summarize(daily, flights = n()))
## # A tibble: 365 x 4
## # Groups: year, month [12]
## year month day flights
## <int> <int> <int> <int>
## 1 2013 1 1 842
## 2 2013 1 2 943
## 3 2013 1 3 914
## 4 2013 1 4 915
## 5 2013 1 5 720
## 6 2013 1 6 832
## 7 2013 1 7 933
## 8 2013 1 8 899
## 9 2013 1 9 902
## 10 2013 1 10 932
## # … with 355 more rows
(per_month <- summarize(per_day, flights = sum(flights)))
## # A tibble: 12 x 3
## # Groups: year [1]
## year month flights
## <int> <int> <int>
## 1 2013 1 27004
## 2 2013 2 24951
## 3 2013 3 28834
## 4 2013 4 28330
## 5 2013 5 28796
## 6 2013 6 28243
## 7 2013 7 29425
## 8 2013 8 29327
## 9 2013 9 27574
## 10 2013 10 28889
## 11 2013 11 27268
## 12 2013 12 28135
(per_year <- summarize(per_month, flights = sum(flights)))
## # A tibble: 1 x 2
## year flights
## <int> <int>
## 1 2013 336776
daily %>%
ungroup() %>%
summarize(flights = n())
## # A tibble: 1 x 1
## flights
## <int>
## 1 336776
flights_sml %>%
group_by(year, month, day) %>%
filter(rank(desc(arr_delay))<10)
## # A tibble: 3,306 x 7
## # Groups: year, month, day [365]
## year month day dep_delay arr_delay distance air_time
## <int> <int> <int> <dbl> <dbl> <dbl> <dbl>
## 1 2013 1 1 853 851 184 41
## 2 2013 1 1 290 338 1134 213
## 3 2013 1 1 260 263 266 46
## 4 2013 1 1 157 174 213 60
## 5 2013 1 1 216 222 708 121
## 6 2013 1 1 255 250 589 115
## 7 2013 1 1 285 246 1085 146
## 8 2013 1 1 192 191 199 44
## 9 2013 1 1 379 456 1092 222
## 10 2013 1 2 224 207 550 94
## # … with 3,296 more rows
popular_dests <- flights %>%
group_by(dest) %>%
filter(n()>365)
popular_dests
## # A tibble: 332,577 x 19
## # Groups: dest [77]
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## 7 2013 1 1 555 600 -5 913
## 8 2013 1 1 557 600 -3 709
## 9 2013 1 1 557 600 -3 838
## 10 2013 1 1 558 600 -2 753
## # … with 332,567 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
popular_dests %>%
filter(arr_delay > 0) %>%
mutate(prop_delay = arr_delay / sum(arr_delay)) %>%
select(year:day, dest, arr_delay, prop_delay)
## # A tibble: 131,106 x 6
## # Groups: dest [77]
## year month day dest arr_delay prop_delay
## <int> <int> <int> <chr> <dbl> <dbl>
## 1 2013 1 1 IAH 11 0.000111
## 2 2013 1 1 IAH 20 0.000201
## 3 2013 1 1 MIA 33 0.000235
## 4 2013 1 1 ORD 12 0.0000424
## 5 2013 1 1 FLL 19 0.0000938
## 6 2013 1 1 ORD 8 0.0000283
## 7 2013 1 1 LAX 7 0.0000344
## 8 2013 1 1 DFW 31 0.000282
## 9 2013 1 1 ATL 12 0.0000400
## 10 2013 1 1 DTW 16 0.000116
## # … with 131,096 more rows
4. Workflow: Scripts
5. Exploratory data analysts
library(tidyverse)
ggplot(data = diamonds) +
geom_bar(mapping = aes(x = cut))

diamonds %>% count(cut)
## # A tibble: 5 x 2
## cut n
## <ord> <int>
## 1 Fair 1610
## 2 Good 4906
## 3 Very Good 12082
## 4 Premium 13791
## 5 Ideal 21551
ggplot(data = diamonds) +
geom_histogram(mapping = aes(x = carat), binwidth = 0.5)

diamonds %>%
count(cut_width(carat, 0.5))
## # A tibble: 11 x 2
## `cut_width(carat, 0.5)` n
## <fct> <int>
## 1 [-0.25,0.25] 785
## 2 (0.25,0.75] 29498
## 3 (0.75,1.25] 15977
## 4 (1.25,1.75] 5313
## 5 (1.75,2.25] 2002
## 6 (2.25,2.75] 322
## 7 (2.75,3.25] 32
## 8 (3.25,3.75] 5
## 9 (3.75,4.25] 4
## 10 (4.25,4.75] 1
## 11 (4.75,5.25] 1
smaller <- diamonds %>%
filter(carat < 3)
ggplot(data = smaller, mapping = aes(x = carat)) +
geom_histogram(binwidth = 0.1)

ggplot(data = smaller, mapping = aes(x = carat, color = cut)) +
geom_freqpoly(binwidth = 0.1)

ggplot(data = smaller, mapping = aes(x = carat)) + geom_histogram(binwidth = 0.01)

ggplot(data = faithful, mapping = aes(x = eruptions)) + geom_histogram(binwidth = 0.25)

ggplot(diamonds) +
geom_histogram(mapping = aes(x = y), binwidth = 0.5)

ggplot(diamonds) +
geom_histogram(mapping = aes(x = y), binwidth = 0.5) +
coord_cartesian(ylim = c(0,50))

unusuak <- diamonds %>%
filter(y < 3 | y > 20) %>%
arrange(y)
unusual
## Error in eval(expr, envir, enclos): object 'unusual' not found
diamonds2 <- diamonds %>%
filter(between(y, 3, 20))
diamonds2 <- diamonds %>%
mutate(y = ifelse(y < 3 | y > 20, NA, y))
ggplot(data = diamonds2, mapping = aes(x = x, y = y)) +
geom_point()
## Warning: Removed 9 rows containing missing values (geom_point).

ggplot(data = diamonds2, mapping = aes(x = x, y = y)) +
geom_point(na.rm = TRUE)

nycflights13::flights %>%
mutate(cancelled = is.na(dep_time),
sched_hour = sched_dep_time %/% 100,
sched_min = sched_dep_time %% 100,
sched_dep_time = sched_hour + sched_min / 60) %>%
ggplot(mapping = aes(sched_dep_time)) +
geom_freqpoly(mapping = aes(color = cancelled),
binwidth = 1/4)

ggplot(data = diamonds, mapping = aes(x = price)) +
geom_freqpoly(mapping = aes(color = cut), binwidth = 500)

ggplot(diamonds) +
geom_bar(mapping = aes(x = cut))

ggplot(data = diamonds, mapping = aes(x = price, y = ..density..)) +
geom_freqpoly(mapping = aes(color = cut), binwidth = 500)

ggplot(data = diamonds, mapping = aes(x = cut, y = price)) +
geom_boxplot()

ggplot(data = mpg, mapping = aes(x = class, y = hwy)) +
geom_boxplot()

ggplot(data = mpg) +
geom_boxplot(mapping = aes(x = reorder(class, hwy, FUN = median),y = hwy))

ggplot(data = mpg) +
geom_boxplot(mapping = aes(x = reorder(class, hwy, FUN = median), y = hwy)) +
coord_flip()

ggplot(data = diamonds) +
geom_count(mapping = aes(x = cut, y = color))

diamonds %>%
count(color, cut)
## # A tibble: 35 x 3
## color cut n
## <ord> <ord> <int>
## 1 D Fair 163
## 2 D Good 662
## 3 D Very Good 1513
## 4 D Premium 1603
## 5 D Ideal 2834
## 6 E Fair 224
## 7 E Good 933
## 8 E Very Good 2400
## 9 E Premium 2337
## 10 E Ideal 3903
## # … with 25 more rows
diamonds %>%
count(color, cut) %>%
ggplot(mapping = aes(x = color, y = cut)) +
geom_tile(mapping = aes(fill = n))

ggplot(data = diamonds) +
geom_point(mapping = aes(x = carat, y = price))

ggplot(data = diamonds) +
geom_point(mapping = aes(x = carat, y = price), alpha = 1/100)

ggplot(data = smaller) +
geom_bin2d(mapping = aes(x = carat, y = price))

ggplot(data = smaller) +
geom_hex(mapping = aes(x = carat, y = price))

ggplot(data = smaller, mapping = aes(x = carat, y = price)) +
geom_boxplot(mapping = aes(group = cut_width(carat, 0.1)))

ggplot(data = smaller, mapping = aes(x = carat, y = price)) +
geom_boxplot(mapping = aes(group = cut_number(carat, 20)))

ggplot(data = faithful) +
geom_point(mapping = aes(x = eruptions, y = waiting))

library(modelr)
mod <- lm(log(price) ~ log(carat), data = diamonds)
diamonds2 <- diamonds %>%
add_residuals(mod) %>%
mutate(resid = exp(resid))
ggplot(data = diamonds2) +
geom_point(mapping = aes(x = carat, y = resid))

ggplot(data = diamonds2) +
geom_boxplot(mapping = aes(x = cut, y = resid))

ggplot(data = faithful, mapping = aes(x = eruptions)) +
geom_freqpoly(bindwidth = 0.25)
## Warning: Ignoring unknown parameters: bindwidth
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(faithful, aes(eruptions)) +
geom_freqpoly(binwidth = 0.25)

diamonds %>%
count(cut, clarity) %>%
ggplot(aes(clarity, cut, fill = n)) +
geom_tile()

6. Workflow: Projects
Part 2. Wrangle
7. Tibbles with tibble
library(tidyverse)
as_tibble(iris)
## # A tibble: 150 x 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <fct>
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
## 7 4.6 3.4 1.4 0.3 setosa
## 8 5 3.4 1.5 0.2 setosa
## 9 4.4 2.9 1.4 0.2 setosa
## 10 4.9 3.1 1.5 0.1 setosa
## # … with 140 more rows
tibble(
x = 1:5,
y = 1,
z = x ^ 2 + y
)
## # A tibble: 5 x 3
## x y z
## <int> <dbl> <dbl>
## 1 1 1 2
## 2 2 1 5
## 3 3 1 10
## 4 4 1 17
## 5 5 1 26
tb <- tibble(
`:)` = "smile",
` ` = "space",
`2000` = "number"
)
tb
## # A tibble: 1 x 3
## `:)` ` ` `2000`
## <chr> <chr> <chr>
## 1 smile space number
tribble(
~x, ~y, ~z,
"a",2,3.6,
"b",1,8.5
)
## # A tibble: 2 x 3
## x y z
## <chr> <dbl> <dbl>
## 1 a 2 3.6
## 2 b 1 8.5
tibble(
a = lubridate::now() + runif(1e3) * 86400,
b = lubridate::today() + runif(1e3) * 30,
c = 1:1e3,
d = runif(1e3),
e = sample(letters, 1e3, replace = TRUE)
)
## # A tibble: 1,000 x 5
## a b c d e
## <dttm> <date> <int> <dbl> <chr>
## 1 2019-04-14 23:18:23 2019-04-22 1 0.0882 p
## 2 2019-04-14 21:04:16 2019-05-01 2 0.175 a
## 3 2019-04-15 09:50:43 2019-05-09 3 0.945 f
## 4 2019-04-14 23:41:59 2019-04-21 4 0.987 i
## 5 2019-04-15 02:38:47 2019-05-11 5 0.0293 a
## 6 2019-04-15 02:55:08 2019-04-18 6 0.448 s
## 7 2019-04-14 22:41:19 2019-05-02 7 0.257 l
## 8 2019-04-15 03:57:50 2019-05-10 8 0.0421 b
## 9 2019-04-15 01:49:20 2019-04-29 9 0.447 s
## 10 2019-04-15 16:59:57 2019-04-19 10 0.718 j
## # … with 990 more rows
nycflights13::flights %>% print(n = 10, width = Inf)
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## 7 2013 1 1 555 600 -5 913
## 8 2013 1 1 557 600 -3 709
## 9 2013 1 1 557 600 -3 838
## 10 2013 1 1 558 600 -2 753
## sched_arr_time arr_delay carrier flight tailnum origin dest air_time
## <int> <dbl> <chr> <int> <chr> <chr> <chr> <dbl>
## 1 819 11 UA 1545 N14228 EWR IAH 227
## 2 830 20 UA 1714 N24211 LGA IAH 227
## 3 850 33 AA 1141 N619AA JFK MIA 160
## 4 1022 -18 B6 725 N804JB JFK BQN 183
## 5 837 -25 DL 461 N668DN LGA ATL 116
## 6 728 12 UA 1696 N39463 EWR ORD 150
## 7 854 19 B6 507 N516JB EWR FLL 158
## 8 723 -14 EV 5708 N829AS LGA IAD 53
## 9 846 -8 B6 79 N593JB JFK MCO 140
## 10 745 8 AA 301 N3ALAA LGA ORD 138
## distance hour minute time_hour
## <dbl> <dbl> <dbl> <dttm>
## 1 1400 5 15 2013-01-01 05:00:00
## 2 1416 5 29 2013-01-01 05:00:00
## 3 1089 5 40 2013-01-01 05:00:00
## 4 1576 5 45 2013-01-01 05:00:00
## 5 762 6 0 2013-01-01 06:00:00
## 6 719 5 58 2013-01-01 05:00:00
## 7 1065 6 0 2013-01-01 06:00:00
## 8 229 6 0 2013-01-01 06:00:00
## 9 944 6 0 2013-01-01 06:00:00
## 10 733 6 0 2013-01-01 06:00:00
## # … with 3.368e+05 more rows
# options(tibble.print_max = n, tibble.print_min = m)
# options(tibble.width = Inf)
df <- tibble(
x = runif(5),
y = rnorm(5)
)
df$x
## [1] 0.289795337 0.005417945 0.704693838 0.091135740 0.569566665
df[["x"]]
## [1] 0.289795337 0.005417945 0.704693838 0.091135740 0.569566665
df["x"]
## # A tibble: 5 x 1
## x
## <dbl>
## 1 0.290
## 2 0.00542
## 3 0.705
## 4 0.0911
## 5 0.570
df[[1]]
## [1] 0.289795337 0.005417945 0.704693838 0.091135740 0.569566665
df[1]
## # A tibble: 5 x 1
## x
## <dbl>
## 1 0.290
## 2 0.00542
## 3 0.705
## 4 0.0911
## 5 0.570
df %>% .$x
## [1] 0.289795337 0.005417945 0.704693838 0.091135740 0.569566665
df %>% .[["x"]]
## [1] 0.289795337 0.005417945 0.704693838 0.091135740 0.569566665
class(as.data.frame(tb))
## [1] "data.frame"
8. Data import with readr
library(tidyverse)
# read_csv(), read_csv2(), read_tsv(), read_delim(), read_fwf(), read_log()
heights <- read_csv("data/heights.csv")
## Error: 'data/heights.csv' does not exist in current working directory ('/Users/stevenli/Desktop/OneDrive - Duke University/BA DS learning/R/R for Data Science').
read_csv("a,b,c
1,2,3
4,5,6")
## # A tibble: 2 x 3
## a b c
## <dbl> <dbl> <dbl>
## 1 1 2 3
## 2 4 5 6
read_csv("The first line of metadata
The second line of metadata
x,y,z
1,2,3", skip = 2)
## # A tibble: 1 x 3
## x y z
## <dbl> <dbl> <dbl>
## 1 1 2 3
read_csv("# A comment I want to skip
x,y,z
1,2,3", comment = "#")
## # A tibble: 1 x 3
## x y z
## <dbl> <dbl> <dbl>
## 1 1 2 3
read_csv("1,2,3\n4,5,6", col_names = FALSE)
## # A tibble: 2 x 3
## X1 X2 X3
## <dbl> <dbl> <dbl>
## 1 1 2 3
## 2 4 5 6
read_csv("1,2,3\n4,5,6", col_names = c("x","y","z"))
## # A tibble: 2 x 3
## x y z
## <dbl> <dbl> <dbl>
## 1 1 2 3
## 2 4 5 6
read_csv("a,b,c\n1,2,.",na = ".")
## # A tibble: 1 x 3
## a b c
## <dbl> <dbl> <lgl>
## 1 1 2 NA
# compared to base R, faster, have progress bar, produce tibbles, don't convert character vectors to facors, use row names, reproducible
str(parse_logical(c("TRUE","FALSE","NA")))
## logi [1:3] TRUE FALSE NA
str(parse_integer(c("1","2","3")))
## int [1:3] 1 2 3
str(parse_date(c("2010-01-01","1979-10-14")))
## Date[1:2], format: "2010-01-01" "1979-10-14"
parse_integer(c("1","231",".","456"), na=".")
## [1] 1 231 NA 456
x <- parse_integer(c("123","345","abc","123.45"))
## Warning: 2 parsing failures.
## row col expected actual
## 3 -- an integer abc
## 4 -- no trailing characters .45
x
## [1] 123 345 NA NA
## attr(,"problems")
## # A tibble: 2 x 4
## row col expected actual
## <int> <int> <chr> <chr>
## 1 3 NA an integer abc
## 2 4 NA no trailing characters .45
problems(x)
## # A tibble: 2 x 4
## row col expected actual
## <int> <int> <chr> <chr>
## 1 3 NA an integer abc
## 2 4 NA no trailing characters .45
parse_double("1.23")
## [1] 1.23
parse_double("1,23", locale = locale(decimal_mark = ","))
## [1] 1.23
parse_number("$100")
## [1] 100
parse_number("20%")
## [1] 20
parse_number("It cost $123.45")
## [1] 123.45
parse_number("$123,456,789")
## [1] 123456789
parse_number("123.456.789", locale = locale(grouping_mark = "."))
## [1] 123456789
parse_number("123`456`789", locale = locale(grouping_mark = "`"))
## [1] 123456789
charToRaw("Hadley")
## [1] 48 61 64 6c 65 79
x1 <- "El Ni\xf1o was particularly bad this year"
x2 <- "\x82\xb1\x82\xf1\x82\xc9\x82\xbf\x82\xcd"
parse_character(x1, locale = locale(encoding = "Latin1"))
## [1] "El Niño was particularly bad this year"
parse_character(x2, locale = locale(encoding = "Shift-JIS"))
## [1] "こんにちは"
guess_encoding(charToRaw(x1))
## # A tibble: 2 x 2
## encoding confidence
## <chr> <dbl>
## 1 ISO-8859-1 0.46
## 2 ISO-8859-9 0.23
guess_encoding(charToRaw(x2))
## # A tibble: 1 x 2
## encoding confidence
## <chr> <dbl>
## 1 KOI8-R 0.42
fruit <- c("apple", "banana")
parse_factor(c("apple","banana","bananana"), levels = fruit)
## Warning: 1 parsing failure.
## row col expected actual
## 3 -- value in level set bananana
## [1] apple banana <NA>
## attr(,"problems")
## # A tibble: 1 x 4
## row col expected actual
## <int> <int> <chr> <chr>
## 1 3 NA value in level set bananana
## Levels: apple banana
parse_datetime("2010-10-01T2010")
## [1] "2010-10-01 20:10:00 UTC"
parse_datetime("20101010")
## [1] "2010-10-10 UTC"
parse_date("2010-10-01")
## [1] "2010-10-01"
library(hms)
parse_time("01:10 am")
## 01:10:00
parse_time("20:10:01")
## 20:10:01
parse_date("01/02/15","%m/%d/%y")
## [1] "2015-01-02"
parse_date("01/02/15","%d/%m/%y")
## [1] "2015-02-01"
parse_date("01/02/15","%y/%m/%d")
## [1] "2001-02-15"
parse_date("1 janvier 2015", "%d %B %Y", locale = locale("fr"))
## [1] "2015-01-01"
guess_parser("2010-10-01")
## [1] "date"
guess_parser("15:01")
## [1] "time"
guess_parser(c("TRUE", "FALSE"))
## [1] "logical"
guess_parser(c("1","5","9"))
## [1] "double"
guess_parser(c("12, 352, 561"))
## [1] "character"
str(parse_guess("2010-10-10"))
## Date[1:1], format: "2010-10-10"
challenge <- read_csv(readr_example("challenge.csv"))
## Parsed with column specification:
## cols(
## x = col_double(),
## y = col_logical()
## )
## Warning: 1000 parsing failures.
## row col expected actual file
## 1001 y 1/0/T/F/TRUE/FALSE 2015-01-16 '/Library/Frameworks/R.framework/Versions/3.5/Resources/library/readr/extdata/challenge.csv'
## 1002 y 1/0/T/F/TRUE/FALSE 2018-05-18 '/Library/Frameworks/R.framework/Versions/3.5/Resources/library/readr/extdata/challenge.csv'
## 1003 y 1/0/T/F/TRUE/FALSE 2015-09-05 '/Library/Frameworks/R.framework/Versions/3.5/Resources/library/readr/extdata/challenge.csv'
## 1004 y 1/0/T/F/TRUE/FALSE 2012-11-28 '/Library/Frameworks/R.framework/Versions/3.5/Resources/library/readr/extdata/challenge.csv'
## 1005 y 1/0/T/F/TRUE/FALSE 2020-01-13 '/Library/Frameworks/R.framework/Versions/3.5/Resources/library/readr/extdata/challenge.csv'
## .... ... .................. .......... ............................................................................................
## See problems(...) for more details.
problems(challenge)
## # A tibble: 1,000 x 5
## row col expected actual file
## <int> <chr> <chr> <chr> <chr>
## 1 1001 y 1/0/T/F/TRUE… 2015-01… '/Library/Frameworks/R.framework/Ver…
## 2 1002 y 1/0/T/F/TRUE… 2018-05… '/Library/Frameworks/R.framework/Ver…
## 3 1003 y 1/0/T/F/TRUE… 2015-09… '/Library/Frameworks/R.framework/Ver…
## 4 1004 y 1/0/T/F/TRUE… 2012-11… '/Library/Frameworks/R.framework/Ver…
## 5 1005 y 1/0/T/F/TRUE… 2020-01… '/Library/Frameworks/R.framework/Ver…
## 6 1006 y 1/0/T/F/TRUE… 2016-04… '/Library/Frameworks/R.framework/Ver…
## 7 1007 y 1/0/T/F/TRUE… 2011-05… '/Library/Frameworks/R.framework/Ver…
## 8 1008 y 1/0/T/F/TRUE… 2020-07… '/Library/Frameworks/R.framework/Ver…
## 9 1009 y 1/0/T/F/TRUE… 2011-04… '/Library/Frameworks/R.framework/Ver…
## 10 1010 y 1/0/T/F/TRUE… 2010-05… '/Library/Frameworks/R.framework/Ver…
## # … with 990 more rows
challenge <- read_csv(readr_example("challenge.csv"),
col_types = cols(
x = col_integer(),
y = col_character()
))
## Warning: 1000 parsing failures.
## row col expected actual file
## 1001 x no trailing characters .23837975086644292 '/Library/Frameworks/R.framework/Versions/3.5/Resources/library/readr/extdata/challenge.csv'
## 1002 x no trailing characters .41167997173033655 '/Library/Frameworks/R.framework/Versions/3.5/Resources/library/readr/extdata/challenge.csv'
## 1003 x no trailing characters .7460716762579978 '/Library/Frameworks/R.framework/Versions/3.5/Resources/library/readr/extdata/challenge.csv'
## 1004 x no trailing characters .723450553836301 '/Library/Frameworks/R.framework/Versions/3.5/Resources/library/readr/extdata/challenge.csv'
## 1005 x no trailing characters .614524137461558 '/Library/Frameworks/R.framework/Versions/3.5/Resources/library/readr/extdata/challenge.csv'
## .... ... ...................... .................. ............................................................................................
## See problems(...) for more details.
challenge <- read_csv(
readr_example("challenge.csv"),
col_types = cols(
x = col_double(),
y = col_character()
)
)
tail(challenge)
## # A tibble: 6 x 2
## x y
## <dbl> <chr>
## 1 0.805 2019-11-21
## 2 0.164 2018-03-29
## 3 0.472 2014-08-04
## 4 0.718 2015-08-16
## 5 0.270 2020-02-04
## 6 0.608 2019-01-06
challenge <- read_csv(
readr_example("challenge.csv"),
col_type = cols(
x = col_double(),
y = col_date()
)
)
tail(challenge)
## # A tibble: 6 x 2
## x y
## <dbl> <date>
## 1 0.805 2019-11-21
## 2 0.164 2018-03-29
## 3 0.472 2014-08-04
## 4 0.718 2015-08-16
## 5 0.270 2020-02-04
## 6 0.608 2019-01-06
challenge2 <- read_csv(
readr_example("challenge.csv"),
guess_max = 1001
)
## Parsed with column specification:
## cols(
## x = col_double(),
## y = col_date(format = "")
## )
challenge2
## # A tibble: 2,000 x 2
## x y
## <dbl> <date>
## 1 404 NA
## 2 4172 NA
## 3 3004 NA
## 4 787 NA
## 5 37 NA
## 6 2332 NA
## 7 2489 NA
## 8 1449 NA
## 9 3665 NA
## 10 3863 NA
## # … with 1,990 more rows
challenge2 <- read_csv(readr_example("challenge.csv"),
col_types = cols(.default = col_character()))
df <- tribble(
~x, ~y,
"1", "1.21",
"2", "2.32",
"3", "4.56"
)
df
## # A tibble: 3 x 2
## x y
## <chr> <chr>
## 1 1 1.21
## 2 2 2.32
## 3 3 4.56
type_convert(df)
## Parsed with column specification:
## cols(
## x = col_double(),
## y = col_double()
## )
## # A tibble: 3 x 2
## x y
## <dbl> <dbl>
## 1 1 1.21
## 2 2 2.32
## 3 3 4.56
write_csv(challenge,"challenge.csv")
challenge
## # A tibble: 2,000 x 2
## x y
## <dbl> <date>
## 1 404 NA
## 2 4172 NA
## 3 3004 NA
## 4 787 NA
## 5 37 NA
## 6 2332 NA
## 7 2489 NA
## 8 1449 NA
## 9 3665 NA
## 10 3863 NA
## # … with 1,990 more rows
write_csv(challenge, "challenge-2.csv")
read_csv("challenge-2.csv")
## Parsed with column specification:
## cols(
## x = col_double(),
## y = col_logical()
## )
## Warning: 1000 parsing failures.
## row col expected actual file
## 1001 y 1/0/T/F/TRUE/FALSE 2015-01-16 'challenge-2.csv'
## 1002 y 1/0/T/F/TRUE/FALSE 2018-05-18 'challenge-2.csv'
## 1003 y 1/0/T/F/TRUE/FALSE 2015-09-05 'challenge-2.csv'
## 1004 y 1/0/T/F/TRUE/FALSE 2012-11-28 'challenge-2.csv'
## 1005 y 1/0/T/F/TRUE/FALSE 2020-01-13 'challenge-2.csv'
## .... ... .................. .......... .................
## See problems(...) for more details.
## # A tibble: 2,000 x 2
## x y
## <dbl> <lgl>
## 1 404 NA
## 2 4172 NA
## 3 3004 NA
## 4 787 NA
## 5 37 NA
## 6 2332 NA
## 7 2489 NA
## 8 1449 NA
## 9 3665 NA
## 10 3863 NA
## # … with 1,990 more rows
write_rds(challenge, "challenge.rds")
read_rds("challenge.rds")
## # A tibble: 2,000 x 2
## x y
## <dbl> <date>
## 1 404 NA
## 2 4172 NA
## 3 3004 NA
## 4 787 NA
## 5 37 NA
## 6 2332 NA
## 7 2489 NA
## 8 1449 NA
## 9 3665 NA
## 10 3863 NA
## # … with 1,990 more rows
library(feather)
write_feather(challenge, "challenge.feather")
read_feather("challenge.feather")
## # A tibble: 2,000 x 2
## x y
## <dbl> <date>
## 1 404 NA
## 2 4172 NA
## 3 3004 NA
## 4 787 NA
## 5 37 NA
## 6 2332 NA
## 7 2489 NA
## 8 1449 NA
## 9 3665 NA
## 10 3863 NA
## # … with 1,990 more rows
9. Tidy data with tidyr
table1
## # A tibble: 6 x 4
## country year cases population
## <chr> <int> <int> <int>
## 1 Afghanistan 1999 745 19987071
## 2 Afghanistan 2000 2666 20595360
## 3 Brazil 1999 37737 172006362
## 4 Brazil 2000 80488 174504898
## 5 China 1999 212258 1272915272
## 6 China 2000 213766 1280428583
table2
## # A tibble: 12 x 4
## country year type count
## <chr> <int> <chr> <int>
## 1 Afghanistan 1999 cases 745
## 2 Afghanistan 1999 population 19987071
## 3 Afghanistan 2000 cases 2666
## 4 Afghanistan 2000 population 20595360
## 5 Brazil 1999 cases 37737
## 6 Brazil 1999 population 172006362
## 7 Brazil 2000 cases 80488
## 8 Brazil 2000 population 174504898
## 9 China 1999 cases 212258
## 10 China 1999 population 1272915272
## 11 China 2000 cases 213766
## 12 China 2000 population 1280428583
table3
## # A tibble: 6 x 3
## country year rate
## * <chr> <int> <chr>
## 1 Afghanistan 1999 745/19987071
## 2 Afghanistan 2000 2666/20595360
## 3 Brazil 1999 37737/172006362
## 4 Brazil 2000 80488/174504898
## 5 China 1999 212258/1272915272
## 6 China 2000 213766/1280428583
table4a
## # A tibble: 3 x 3
## country `1999` `2000`
## * <chr> <int> <int>
## 1 Afghanistan 745 2666
## 2 Brazil 37737 80488
## 3 China 212258 213766
table4b
## # A tibble: 3 x 3
## country `1999` `2000`
## * <chr> <int> <int>
## 1 Afghanistan 19987071 20595360
## 2 Brazil 172006362 174504898
## 3 China 1272915272 1280428583
table1 %>%
mutate(rate = cases / population * 10000)
## # A tibble: 6 x 5
## country year cases population rate
## <chr> <int> <int> <int> <dbl>
## 1 Afghanistan 1999 745 19987071 0.373
## 2 Afghanistan 2000 2666 20595360 1.29
## 3 Brazil 1999 37737 172006362 2.19
## 4 Brazil 2000 80488 174504898 4.61
## 5 China 1999 212258 1272915272 1.67
## 6 China 2000 213766 1280428583 1.67
table1 %>%
count(year, wt = cases)
## # A tibble: 2 x 2
## year n
## <int> <int>
## 1 1999 250740
## 2 2000 296920
library(ggplot2)
ggplot(table1, aes(year, cases)) +
geom_line(aes(group = country), color = "grey50") +
geom_point(aes(color = country))

table4a
## # A tibble: 3 x 3
## country `1999` `2000`
## * <chr> <int> <int>
## 1 Afghanistan 745 2666
## 2 Brazil 37737 80488
## 3 China 212258 213766
table4a %>%
gather(`1999`,`2000`,key = "year", value = "cases")
## # A tibble: 6 x 3
## country year cases
## <chr> <chr> <int>
## 1 Afghanistan 1999 745
## 2 Brazil 1999 37737
## 3 China 1999 212258
## 4 Afghanistan 2000 2666
## 5 Brazil 2000 80488
## 6 China 2000 213766
table4b
## # A tibble: 3 x 3
## country `1999` `2000`
## * <chr> <int> <int>
## 1 Afghanistan 19987071 20595360
## 2 Brazil 172006362 174504898
## 3 China 1272915272 1280428583
table4b %>%
gather(`1999`, `2000`, key = "year", value = "population")
## # A tibble: 6 x 3
## country year population
## <chr> <chr> <int>
## 1 Afghanistan 1999 19987071
## 2 Brazil 1999 172006362
## 3 China 1999 1272915272
## 4 Afghanistan 2000 20595360
## 5 Brazil 2000 174504898
## 6 China 2000 1280428583
tidy4a <- table4a %>%
gather(`1999`, `2000`, key = "year", value = "cases")
tidy4b <- table4b %>%
gather(`1999`, `2000`, key = "year", value = "population")
left_join(tidy4a, tidy4b)
## Joining, by = c("country", "year")
## # A tibble: 6 x 4
## country year cases population
## <chr> <chr> <int> <int>
## 1 Afghanistan 1999 745 19987071
## 2 Brazil 1999 37737 172006362
## 3 China 1999 212258 1272915272
## 4 Afghanistan 2000 2666 20595360
## 5 Brazil 2000 80488 174504898
## 6 China 2000 213766 1280428583
table2
## # A tibble: 12 x 4
## country year type count
## <chr> <int> <chr> <int>
## 1 Afghanistan 1999 cases 745
## 2 Afghanistan 1999 population 19987071
## 3 Afghanistan 2000 cases 2666
## 4 Afghanistan 2000 population 20595360
## 5 Brazil 1999 cases 37737
## 6 Brazil 1999 population 172006362
## 7 Brazil 2000 cases 80488
## 8 Brazil 2000 population 174504898
## 9 China 1999 cases 212258
## 10 China 1999 population 1272915272
## 11 China 2000 cases 213766
## 12 China 2000 population 1280428583
spread(table2, key = type, value = count)
## # A tibble: 6 x 4
## country year cases population
## <chr> <int> <int> <int>
## 1 Afghanistan 1999 745 19987071
## 2 Afghanistan 2000 2666 20595360
## 3 Brazil 1999 37737 172006362
## 4 Brazil 2000 80488 174504898
## 5 China 1999 212258 1272915272
## 6 China 2000 213766 1280428583
table3
## # A tibble: 6 x 3
## country year rate
## * <chr> <int> <chr>
## 1 Afghanistan 1999 745/19987071
## 2 Afghanistan 2000 2666/20595360
## 3 Brazil 1999 37737/172006362
## 4 Brazil 2000 80488/174504898
## 5 China 1999 212258/1272915272
## 6 China 2000 213766/1280428583
table3 %>%
separate(rate, into = c("cases", "population"))
## # A tibble: 6 x 4
## country year cases population
## <chr> <int> <chr> <chr>
## 1 Afghanistan 1999 745 19987071
## 2 Afghanistan 2000 2666 20595360
## 3 Brazil 1999 37737 172006362
## 4 Brazil 2000 80488 174504898
## 5 China 1999 212258 1272915272
## 6 China 2000 213766 1280428583
table3 %>%
separate(rate, into = c("cases", "population"), sep = "/")
## # A tibble: 6 x 4
## country year cases population
## <chr> <int> <chr> <chr>
## 1 Afghanistan 1999 745 19987071
## 2 Afghanistan 2000 2666 20595360
## 3 Brazil 1999 37737 172006362
## 4 Brazil 2000 80488 174504898
## 5 China 1999 212258 1272915272
## 6 China 2000 213766 1280428583
table3 %>%
separate(
rate,
into = c("cases","population"),
convert = TRUE
)
## # A tibble: 6 x 4
## country year cases population
## <chr> <int> <int> <int>
## 1 Afghanistan 1999 745 19987071
## 2 Afghanistan 2000 2666 20595360
## 3 Brazil 1999 37737 172006362
## 4 Brazil 2000 80488 174504898
## 5 China 1999 212258 1272915272
## 6 China 2000 213766 1280428583
table5
## # A tibble: 6 x 4
## country century year rate
## * <chr> <chr> <chr> <chr>
## 1 Afghanistan 19 99 745/19987071
## 2 Afghanistan 20 00 2666/20595360
## 3 Brazil 19 99 37737/172006362
## 4 Brazil 20 00 80488/174504898
## 5 China 19 99 212258/1272915272
## 6 China 20 00 213766/1280428583
table5 %>%
unite(new, century, year)
## # A tibble: 6 x 3
## country new rate
## <chr> <chr> <chr>
## 1 Afghanistan 19_99 745/19987071
## 2 Afghanistan 20_00 2666/20595360
## 3 Brazil 19_99 37737/172006362
## 4 Brazil 20_00 80488/174504898
## 5 China 19_99 212258/1272915272
## 6 China 20_00 213766/1280428583
table5 %>%
unite(new, century, year, sep = "")
## # A tibble: 6 x 3
## country new rate
## <chr> <chr> <chr>
## 1 Afghanistan 1999 745/19987071
## 2 Afghanistan 2000 2666/20595360
## 3 Brazil 1999 37737/172006362
## 4 Brazil 2000 80488/174504898
## 5 China 1999 212258/1272915272
## 6 China 2000 213766/1280428583
stocks <- tibble(
year = c(2015, 2015, 2015, 2015, 2016, 2016, 2016),
qtr = c(1, 2, 3, 4, 2, 3, 4),
return = c(1.88, 0.59, 0.35, NA, 0.92, 0.17, 2.66)
)
stocks
## # A tibble: 7 x 3
## year qtr return
## <dbl> <dbl> <dbl>
## 1 2015 1 1.88
## 2 2015 2 0.59
## 3 2015 3 0.35
## 4 2015 4 NA
## 5 2016 2 0.92
## 6 2016 3 0.17
## 7 2016 4 2.66
stocks %>%
spread(year, return)
## # A tibble: 4 x 3
## qtr `2015` `2016`
## <dbl> <dbl> <dbl>
## 1 1 1.88 NA
## 2 2 0.59 0.92
## 3 3 0.35 0.17
## 4 4 NA 2.66
stocks %>%
spread(year, return) %>%
gather(year, return, `2015`, `2016`, na.rm = TRUE)
## # A tibble: 6 x 3
## qtr year return
## <dbl> <chr> <dbl>
## 1 1 2015 1.88
## 2 2 2015 0.59
## 3 3 2015 0.35
## 4 2 2016 0.92
## 5 3 2016 0.17
## 6 4 2016 2.66
stocks %>%
complete(year, qtr)
## # A tibble: 8 x 3
## year qtr return
## <dbl> <dbl> <dbl>
## 1 2015 1 1.88
## 2 2015 2 0.59
## 3 2015 3 0.35
## 4 2015 4 NA
## 5 2016 1 NA
## 6 2016 2 0.92
## 7 2016 3 0.17
## 8 2016 4 2.66
treatment <- tribble(
~ person, ~ treatment, ~ response,
"Derrick Whitmore", 1, 7,
NA, 2, 10,
NA, 3, 9,
"Katherine Burke", 1, 4
)
treatment
## # A tibble: 4 x 3
## person treatment response
## <chr> <dbl> <dbl>
## 1 Derrick Whitmore 1 7
## 2 <NA> 2 10
## 3 <NA> 3 9
## 4 Katherine Burke 1 4
treatment %>%
fill(person)
## # A tibble: 4 x 3
## person treatment response
## <chr> <dbl> <dbl>
## 1 Derrick Whitmore 1 7
## 2 Derrick Whitmore 2 10
## 3 Derrick Whitmore 3 9
## 4 Katherine Burke 1 4
who
## # A tibble: 7,240 x 60
## country iso2 iso3 year new_sp_m014 new_sp_m1524 new_sp_m2534
## <chr> <chr> <chr> <int> <int> <int> <int>
## 1 Afghan… AF AFG 1980 NA NA NA
## 2 Afghan… AF AFG 1981 NA NA NA
## 3 Afghan… AF AFG 1982 NA NA NA
## 4 Afghan… AF AFG 1983 NA NA NA
## 5 Afghan… AF AFG 1984 NA NA NA
## 6 Afghan… AF AFG 1985 NA NA NA
## 7 Afghan… AF AFG 1986 NA NA NA
## 8 Afghan… AF AFG 1987 NA NA NA
## 9 Afghan… AF AFG 1988 NA NA NA
## 10 Afghan… AF AFG 1989 NA NA NA
## # … with 7,230 more rows, and 53 more variables: new_sp_m3544 <int>,
## # new_sp_m4554 <int>, new_sp_m5564 <int>, new_sp_m65 <int>,
## # new_sp_f014 <int>, new_sp_f1524 <int>, new_sp_f2534 <int>,
## # new_sp_f3544 <int>, new_sp_f4554 <int>, new_sp_f5564 <int>,
## # new_sp_f65 <int>, new_sn_m014 <int>, new_sn_m1524 <int>,
## # new_sn_m2534 <int>, new_sn_m3544 <int>, new_sn_m4554 <int>,
## # new_sn_m5564 <int>, new_sn_m65 <int>, new_sn_f014 <int>,
## # new_sn_f1524 <int>, new_sn_f2534 <int>, new_sn_f3544 <int>,
## # new_sn_f4554 <int>, new_sn_f5564 <int>, new_sn_f65 <int>,
## # new_ep_m014 <int>, new_ep_m1524 <int>, new_ep_m2534 <int>,
## # new_ep_m3544 <int>, new_ep_m4554 <int>, new_ep_m5564 <int>,
## # new_ep_m65 <int>, new_ep_f014 <int>, new_ep_f1524 <int>,
## # new_ep_f2534 <int>, new_ep_f3544 <int>, new_ep_f4554 <int>,
## # new_ep_f5564 <int>, new_ep_f65 <int>, newrel_m014 <int>,
## # newrel_m1524 <int>, newrel_m2534 <int>, newrel_m3544 <int>,
## # newrel_m4554 <int>, newrel_m5564 <int>, newrel_m65 <int>,
## # newrel_f014 <int>, newrel_f1524 <int>, newrel_f2534 <int>,
## # newrel_f3544 <int>, newrel_f4554 <int>, newrel_f5564 <int>,
## # newrel_f65 <int>
who1 <- who %>%
gather(new_sp_m014:newrel_f65,
key = "key",
value = "cases",
na.rm = TRUE)
who1
## # A tibble: 76,046 x 6
## country iso2 iso3 year key cases
## <chr> <chr> <chr> <int> <chr> <int>
## 1 Afghanistan AF AFG 1997 new_sp_m014 0
## 2 Afghanistan AF AFG 1998 new_sp_m014 30
## 3 Afghanistan AF AFG 1999 new_sp_m014 8
## 4 Afghanistan AF AFG 2000 new_sp_m014 52
## 5 Afghanistan AF AFG 2001 new_sp_m014 129
## 6 Afghanistan AF AFG 2002 new_sp_m014 90
## 7 Afghanistan AF AFG 2003 new_sp_m014 127
## 8 Afghanistan AF AFG 2004 new_sp_m014 139
## 9 Afghanistan AF AFG 2005 new_sp_m014 151
## 10 Afghanistan AF AFG 2006 new_sp_m014 193
## # … with 76,036 more rows
who1 %>%
count(key)
## # A tibble: 56 x 2
## key n
## <chr> <int>
## 1 new_ep_f014 1032
## 2 new_ep_f1524 1021
## 3 new_ep_f2534 1021
## 4 new_ep_f3544 1021
## 5 new_ep_f4554 1017
## 6 new_ep_f5564 1017
## 7 new_ep_f65 1014
## 8 new_ep_m014 1038
## 9 new_ep_m1524 1026
## 10 new_ep_m2534 1020
## # … with 46 more rows
who2 <- who1 %>%
mutate(key = stringr::str_replace(key, "newrel", "new_rel"))
who2
## # A tibble: 76,046 x 6
## country iso2 iso3 year key cases
## <chr> <chr> <chr> <int> <chr> <int>
## 1 Afghanistan AF AFG 1997 new_sp_m014 0
## 2 Afghanistan AF AFG 1998 new_sp_m014 30
## 3 Afghanistan AF AFG 1999 new_sp_m014 8
## 4 Afghanistan AF AFG 2000 new_sp_m014 52
## 5 Afghanistan AF AFG 2001 new_sp_m014 129
## 6 Afghanistan AF AFG 2002 new_sp_m014 90
## 7 Afghanistan AF AFG 2003 new_sp_m014 127
## 8 Afghanistan AF AFG 2004 new_sp_m014 139
## 9 Afghanistan AF AFG 2005 new_sp_m014 151
## 10 Afghanistan AF AFG 2006 new_sp_m014 193
## # … with 76,036 more rows
who3 <- who2 %>%
separate(key, c("new", "type", "sexage"), sep = "_")
who3
## # A tibble: 76,046 x 8
## country iso2 iso3 year new type sexage cases
## <chr> <chr> <chr> <int> <chr> <chr> <chr> <int>
## 1 Afghanistan AF AFG 1997 new sp m014 0
## 2 Afghanistan AF AFG 1998 new sp m014 30
## 3 Afghanistan AF AFG 1999 new sp m014 8
## 4 Afghanistan AF AFG 2000 new sp m014 52
## 5 Afghanistan AF AFG 2001 new sp m014 129
## 6 Afghanistan AF AFG 2002 new sp m014 90
## 7 Afghanistan AF AFG 2003 new sp m014 127
## 8 Afghanistan AF AFG 2004 new sp m014 139
## 9 Afghanistan AF AFG 2005 new sp m014 151
## 10 Afghanistan AF AFG 2006 new sp m014 193
## # … with 76,036 more rows
who3 %>%
count(new)
## # A tibble: 1 x 2
## new n
## <chr> <int>
## 1 new 76046
who4 <- who3 %>%
select(-new, -iso2, -iso3)
who5 <- who4 %>%
separate(sexage, c("sex","age"),sep = 1)
who5
## # A tibble: 76,046 x 6
## country year type sex age cases
## <chr> <int> <chr> <chr> <chr> <int>
## 1 Afghanistan 1997 sp m 014 0
## 2 Afghanistan 1998 sp m 014 30
## 3 Afghanistan 1999 sp m 014 8
## 4 Afghanistan 2000 sp m 014 52
## 5 Afghanistan 2001 sp m 014 129
## 6 Afghanistan 2002 sp m 014 90
## 7 Afghanistan 2003 sp m 014 127
## 8 Afghanistan 2004 sp m 014 139
## 9 Afghanistan 2005 sp m 014 151
## 10 Afghanistan 2006 sp m 014 193
## # … with 76,036 more rows
who %>%
gather(code, value, new_sp_m014:newrel_f65, na.rm = TRUE) %>%
mutate(code = stringr::str_replace(code, "newrel", "new_rel")) %>%
separate(code, c("new", "var", "sexage")) %>%
select(-new, -iso2, -iso3) %>%
separate(sexage, c("sex", "age"), sep = 1)
## # A tibble: 76,046 x 6
## country year var sex age value
## <chr> <int> <chr> <chr> <chr> <int>
## 1 Afghanistan 1997 sp m 014 0
## 2 Afghanistan 1998 sp m 014 30
## 3 Afghanistan 1999 sp m 014 8
## 4 Afghanistan 2000 sp m 014 52
## 5 Afghanistan 2001 sp m 014 129
## 6 Afghanistan 2002 sp m 014 90
## 7 Afghanistan 2003 sp m 014 127
## 8 Afghanistan 2004 sp m 014 139
## 9 Afghanistan 2005 sp m 014 151
## 10 Afghanistan 2006 sp m 014 193
## # … with 76,036 more rows
10. Relational data with dplyr
library(tidyverse)
library(nycflights13)
airlines
## # A tibble: 16 x 2
## carrier name
## <chr> <chr>
## 1 9E Endeavor Air Inc.
## 2 AA American Airlines Inc.
## 3 AS Alaska Airlines Inc.
## 4 B6 JetBlue Airways
## 5 DL Delta Air Lines Inc.
## 6 EV ExpressJet Airlines Inc.
## 7 F9 Frontier Airlines Inc.
## 8 FL AirTran Airways Corporation
## 9 HA Hawaiian Airlines Inc.
## 10 MQ Envoy Air
## 11 OO SkyWest Airlines Inc.
## 12 UA United Air Lines Inc.
## 13 US US Airways Inc.
## 14 VX Virgin America
## 15 WN Southwest Airlines Co.
## 16 YV Mesa Airlines Inc.
airports
## # A tibble: 1,458 x 8
## faa name lat lon alt tz dst tzone
## <chr> <chr> <dbl> <dbl> <int> <dbl> <chr> <chr>
## 1 04G Lansdowne Airport 41.1 -80.6 1044 -5 A America/New_…
## 2 06A Moton Field Municipa… 32.5 -85.7 264 -6 A America/Chic…
## 3 06C Schaumburg Regional 42.0 -88.1 801 -6 A America/Chic…
## 4 06N Randall Airport 41.4 -74.4 523 -5 A America/New_…
## 5 09J Jekyll Island Airport 31.1 -81.4 11 -5 A America/New_…
## 6 0A9 Elizabethton Municip… 36.4 -82.2 1593 -5 A America/New_…
## 7 0G6 Williams County Airp… 41.5 -84.5 730 -5 A America/New_…
## 8 0G7 Finger Lakes Regiona… 42.9 -76.8 492 -5 A America/New_…
## 9 0P2 Shoestring Aviation … 39.8 -76.6 1000 -5 U America/New_…
## 10 0S9 Jefferson County Intl 48.1 -123. 108 -8 A America/Los_…
## # … with 1,448 more rows
planes
## # A tibble: 3,322 x 9
## tailnum year type manufacturer model engines seats speed engine
## <chr> <int> <chr> <chr> <chr> <int> <int> <int> <chr>
## 1 N10156 2004 Fixed win… EMBRAER EMB-1… 2 55 NA Turbo…
## 2 N102UW 1998 Fixed win… AIRBUS INDUS… A320-… 2 182 NA Turbo…
## 3 N103US 1999 Fixed win… AIRBUS INDUS… A320-… 2 182 NA Turbo…
## 4 N104UW 1999 Fixed win… AIRBUS INDUS… A320-… 2 182 NA Turbo…
## 5 N10575 2002 Fixed win… EMBRAER EMB-1… 2 55 NA Turbo…
## 6 N105UW 1999 Fixed win… AIRBUS INDUS… A320-… 2 182 NA Turbo…
## 7 N107US 1999 Fixed win… AIRBUS INDUS… A320-… 2 182 NA Turbo…
## 8 N108UW 1999 Fixed win… AIRBUS INDUS… A320-… 2 182 NA Turbo…
## 9 N109UW 1999 Fixed win… AIRBUS INDUS… A320-… 2 182 NA Turbo…
## 10 N110UW 1999 Fixed win… AIRBUS INDUS… A320-… 2 182 NA Turbo…
## # … with 3,312 more rows
weather
## # A tibble: 26,115 x 15
## origin year month day hour temp dewp humid wind_dir wind_speed
## <chr> <dbl> <dbl> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 EWR 2013 1 1 1 39.0 26.1 59.4 270 10.4
## 2 EWR 2013 1 1 2 39.0 27.0 61.6 250 8.06
## 3 EWR 2013 1 1 3 39.0 28.0 64.4 240 11.5
## 4 EWR 2013 1 1 4 39.9 28.0 62.2 250 12.7
## 5 EWR 2013 1 1 5 39.0 28.0 64.4 260 12.7
## 6 EWR 2013 1 1 6 37.9 28.0 67.2 240 11.5
## 7 EWR 2013 1 1 7 39.0 28.0 64.4 240 15.0
## 8 EWR 2013 1 1 8 39.9 28.0 62.2 250 10.4
## 9 EWR 2013 1 1 9 39.9 28.0 62.2 260 15.0
## 10 EWR 2013 1 1 10 41 28.0 59.6 260 13.8
## # … with 26,105 more rows, and 5 more variables: wind_gust <dbl>,
## # precip <dbl>, pressure <dbl>, visib <dbl>, time_hour <dttm>
planes %>%
count(tailnum) %>%
filter(n > 1)
## # A tibble: 0 x 2
## # … with 2 variables: tailnum <chr>, n <int>
weather %>%
count(year, month, day, hour, origin) %>%
filter(n > 1)
## # A tibble: 3 x 6
## year month day hour origin n
## <dbl> <dbl> <int> <int> <chr> <int>
## 1 2013 11 3 1 EWR 2
## 2 2013 11 3 1 JFK 2
## 3 2013 11 3 1 LGA 2
flights %>%
count(year, month, day, flight) %>%
filter(n > 1)
## # A tibble: 29,768 x 5
## year month day flight n
## <int> <int> <int> <int> <int>
## 1 2013 1 1 1 2
## 2 2013 1 1 3 2
## 3 2013 1 1 4 2
## 4 2013 1 1 11 3
## 5 2013 1 1 15 2
## 6 2013 1 1 21 2
## 7 2013 1 1 27 4
## 8 2013 1 1 31 2
## 9 2013 1 1 32 2
## 10 2013 1 1 35 2
## # … with 29,758 more rows
flights %>%
count(year, month, day, tailnum) %>%
filter(n > 1)
## # A tibble: 64,928 x 5
## year month day tailnum n
## <int> <int> <int> <chr> <int>
## 1 2013 1 1 N0EGMQ 2
## 2 2013 1 1 N11189 2
## 3 2013 1 1 N11536 2
## 4 2013 1 1 N11544 3
## 5 2013 1 1 N11551 2
## 6 2013 1 1 N12540 2
## 7 2013 1 1 N12567 2
## 8 2013 1 1 N13123 2
## 9 2013 1 1 N13538 3
## 10 2013 1 1 N13566 3
## # … with 64,918 more rows
flights2 <- flights %>%
select(year:day, hour, origin, dest, tailnum, carrier)
flights2
## # A tibble: 336,776 x 8
## year month day hour origin dest tailnum carrier
## <int> <int> <int> <dbl> <chr> <chr> <chr> <chr>
## 1 2013 1 1 5 EWR IAH N14228 UA
## 2 2013 1 1 5 LGA IAH N24211 UA
## 3 2013 1 1 5 JFK MIA N619AA AA
## 4 2013 1 1 5 JFK BQN N804JB B6
## 5 2013 1 1 6 LGA ATL N668DN DL
## 6 2013 1 1 5 EWR ORD N39463 UA
## 7 2013 1 1 6 EWR FLL N516JB B6
## 8 2013 1 1 6 LGA IAD N829AS EV
## 9 2013 1 1 6 JFK MCO N593JB B6
## 10 2013 1 1 6 LGA ORD N3ALAA AA
## # … with 336,766 more rows
flights2 %>%
select(-origin, -dest) %>%
left_join(airlines, by = "carrier")
## # A tibble: 336,776 x 7
## year month day hour tailnum carrier name
## <int> <int> <int> <dbl> <chr> <chr> <chr>
## 1 2013 1 1 5 N14228 UA United Air Lines Inc.
## 2 2013 1 1 5 N24211 UA United Air Lines Inc.
## 3 2013 1 1 5 N619AA AA American Airlines Inc.
## 4 2013 1 1 5 N804JB B6 JetBlue Airways
## 5 2013 1 1 6 N668DN DL Delta Air Lines Inc.
## 6 2013 1 1 5 N39463 UA United Air Lines Inc.
## 7 2013 1 1 6 N516JB B6 JetBlue Airways
## 8 2013 1 1 6 N829AS EV ExpressJet Airlines Inc.
## 9 2013 1 1 6 N593JB B6 JetBlue Airways
## 10 2013 1 1 6 N3ALAA AA American Airlines Inc.
## # … with 336,766 more rows
flights2 %>%
select(-origin, -dest) %>%
mutate(name = airlines$name[match(carrier, airlines$carrier)])
## # A tibble: 336,776 x 7
## year month day hour tailnum carrier name
## <int> <int> <int> <dbl> <chr> <chr> <chr>
## 1 2013 1 1 5 N14228 UA United Air Lines Inc.
## 2 2013 1 1 5 N24211 UA United Air Lines Inc.
## 3 2013 1 1 5 N619AA AA American Airlines Inc.
## 4 2013 1 1 5 N804JB B6 JetBlue Airways
## 5 2013 1 1 6 N668DN DL Delta Air Lines Inc.
## 6 2013 1 1 5 N39463 UA United Air Lines Inc.
## 7 2013 1 1 6 N516JB B6 JetBlue Airways
## 8 2013 1 1 6 N829AS EV ExpressJet Airlines Inc.
## 9 2013 1 1 6 N593JB B6 JetBlue Airways
## 10 2013 1 1 6 N3ALAA AA American Airlines Inc.
## # … with 336,766 more rows
x <- tribble(
~ key, ~val_X,
1, "x1",
2, "x2",
3, "x3"
)
y <- tribble(
~ key, ~ val_y,
1, "y1",
2, "y2",
4, "y3"
)
x %>%
inner_join(y, by = "key")
## # A tibble: 2 x 3
## key val_X val_y
## <dbl> <chr> <chr>
## 1 1 x1 y1
## 2 2 x2 y2
x <- tribble(
~ key, ~val_x,
1, "x1",
2, "x2",
2, "x3",
1, "x4"
)
y <- tribble(
~ key, ~val_y,
1, "y1",
2, "y2"
)
left_join(x, y, by = "key")
## # A tibble: 4 x 3
## key val_x val_y
## <dbl> <chr> <chr>
## 1 1 x1 y1
## 2 2 x2 y2
## 3 2 x3 y2
## 4 1 x4 y1
x <- tribble(
~key, ~val_x,
1, "x1",
2, "x2",
2, "x3",
3, "x4"
)
y <- tribble(
~key, ~val_y,
1, "y1",
2, "y2",
2, "y3",
3, "y4"
)
left_join(x, y, by = "key")
## # A tibble: 6 x 3
## key val_x val_y
## <dbl> <chr> <chr>
## 1 1 x1 y1
## 2 2 x2 y2
## 3 2 x2 y3
## 4 2 x3 y2
## 5 2 x3 y3
## 6 3 x4 y4
flights2 %>%
left_join(weather)
## Joining, by = c("year", "month", "day", "hour", "origin")
## # A tibble: 336,776 x 18
## year month day hour origin dest tailnum carrier temp dewp humid
## <dbl> <dbl> <int> <dbl> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 2013 1 1 5 EWR IAH N14228 UA 39.0 28.0 64.4
## 2 2013 1 1 5 LGA IAH N24211 UA 39.9 25.0 54.8
## 3 2013 1 1 5 JFK MIA N619AA AA 39.0 27.0 61.6
## 4 2013 1 1 5 JFK BQN N804JB B6 39.0 27.0 61.6
## 5 2013 1 1 6 LGA ATL N668DN DL 39.9 25.0 54.8
## 6 2013 1 1 5 EWR ORD N39463 UA 39.0 28.0 64.4
## 7 2013 1 1 6 EWR FLL N516JB B6 37.9 28.0 67.2
## 8 2013 1 1 6 LGA IAD N829AS EV 39.9 25.0 54.8
## 9 2013 1 1 6 JFK MCO N593JB B6 37.9 27.0 64.3
## 10 2013 1 1 6 LGA ORD N3ALAA AA 39.9 25.0 54.8
## # … with 336,766 more rows, and 7 more variables: wind_dir <dbl>,
## # wind_speed <dbl>, wind_gust <dbl>, precip <dbl>, pressure <dbl>,
## # visib <dbl>, time_hour <dttm>
flights2 %>%
left_join(planes, by = "tailnum")
## # A tibble: 336,776 x 16
## year.x month day hour origin dest tailnum carrier year.y type
## <int> <int> <int> <dbl> <chr> <chr> <chr> <chr> <int> <chr>
## 1 2013 1 1 5 EWR IAH N14228 UA 1999 Fixe…
## 2 2013 1 1 5 LGA IAH N24211 UA 1998 Fixe…
## 3 2013 1 1 5 JFK MIA N619AA AA 1990 Fixe…
## 4 2013 1 1 5 JFK BQN N804JB B6 2012 Fixe…
## 5 2013 1 1 6 LGA ATL N668DN DL 1991 Fixe…
## 6 2013 1 1 5 EWR ORD N39463 UA 2012 Fixe…
## 7 2013 1 1 6 EWR FLL N516JB B6 2000 Fixe…
## 8 2013 1 1 6 LGA IAD N829AS EV 1998 Fixe…
## 9 2013 1 1 6 JFK MCO N593JB B6 2004 Fixe…
## 10 2013 1 1 6 LGA ORD N3ALAA AA NA <NA>
## # … with 336,766 more rows, and 6 more variables: manufacturer <chr>,
## # model <chr>, engines <int>, seats <int>, speed <int>, engine <chr>
flights2 %>%
left_join(airports, c("dest" = "faa"))
## # A tibble: 336,776 x 15
## year month day hour origin dest tailnum carrier name lat lon
## <int> <int> <int> <dbl> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 2013 1 1 5 EWR IAH N14228 UA Geor… 30.0 -95.3
## 2 2013 1 1 5 LGA IAH N24211 UA Geor… 30.0 -95.3
## 3 2013 1 1 5 JFK MIA N619AA AA Miam… 25.8 -80.3
## 4 2013 1 1 5 JFK BQN N804JB B6 <NA> NA NA
## 5 2013 1 1 6 LGA ATL N668DN DL Hart… 33.6 -84.4
## 6 2013 1 1 5 EWR ORD N39463 UA Chic… 42.0 -87.9
## 7 2013 1 1 6 EWR FLL N516JB B6 Fort… 26.1 -80.2
## 8 2013 1 1 6 LGA IAD N829AS EV Wash… 38.9 -77.5
## 9 2013 1 1 6 JFK MCO N593JB B6 Orla… 28.4 -81.3
## 10 2013 1 1 6 LGA ORD N3ALAA AA Chic… 42.0 -87.9
## # … with 336,766 more rows, and 4 more variables: alt <int>, tz <dbl>,
## # dst <chr>, tzone <chr>
flights2 %>%
left_join(airports, c("origin" = "faa"))
## # A tibble: 336,776 x 15
## year month day hour origin dest tailnum carrier name lat lon
## <int> <int> <int> <dbl> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 2013 1 1 5 EWR IAH N14228 UA Newa… 40.7 -74.2
## 2 2013 1 1 5 LGA IAH N24211 UA La G… 40.8 -73.9
## 3 2013 1 1 5 JFK MIA N619AA AA John… 40.6 -73.8
## 4 2013 1 1 5 JFK BQN N804JB B6 John… 40.6 -73.8
## 5 2013 1 1 6 LGA ATL N668DN DL La G… 40.8 -73.9
## 6 2013 1 1 5 EWR ORD N39463 UA Newa… 40.7 -74.2
## 7 2013 1 1 6 EWR FLL N516JB B6 Newa… 40.7 -74.2
## 8 2013 1 1 6 LGA IAD N829AS EV La G… 40.8 -73.9
## 9 2013 1 1 6 JFK MCO N593JB B6 John… 40.6 -73.8
## 10 2013 1 1 6 LGA ORD N3ALAA AA La G… 40.8 -73.9
## # … with 336,766 more rows, and 4 more variables: alt <int>, tz <dbl>,
## # dst <chr>, tzone <chr>
top_dest <- flights %>%
count(dest, sort = TRUE) %>%
head(10)
top_dest
## # A tibble: 10 x 2
## dest n
## <chr> <int>
## 1 ORD 17283
## 2 ATL 17215
## 3 LAX 16174
## 4 BOS 15508
## 5 MCO 14082
## 6 CLT 14064
## 7 SFO 13331
## 8 FLL 12055
## 9 MIA 11728
## 10 DCA 9705
flights %>%
filter(dest %in% top_dest$dest)
## # A tibble: 141,145 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 542 540 2 923
## 2 2013 1 1 554 600 -6 812
## 3 2013 1 1 554 558 -4 740
## 4 2013 1 1 555 600 -5 913
## 5 2013 1 1 557 600 -3 838
## 6 2013 1 1 558 600 -2 753
## 7 2013 1 1 558 600 -2 924
## 8 2013 1 1 558 600 -2 923
## 9 2013 1 1 559 559 0 702
## 10 2013 1 1 600 600 0 851
## # … with 141,135 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
flights %>%
semi_join(top_dest)
## Joining, by = "dest"
## # A tibble: 141,145 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 542 540 2 923
## 2 2013 1 1 554 600 -6 812
## 3 2013 1 1 554 558 -4 740
## 4 2013 1 1 555 600 -5 913
## 5 2013 1 1 557 600 -3 838
## 6 2013 1 1 558 600 -2 753
## 7 2013 1 1 558 600 -2 924
## 8 2013 1 1 558 600 -2 923
## 9 2013 1 1 559 559 0 702
## 10 2013 1 1 600 600 0 851
## # … with 141,135 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
flights %>%
anti_join(planes, by = "tailnum") %>%
count(tailnum, sort = TRUE)
## # A tibble: 722 x 2
## tailnum n
## <chr> <int>
## 1 <NA> 2512
## 2 N725MQ 575
## 3 N722MQ 513
## 4 N723MQ 507
## 5 N713MQ 483
## 6 N735MQ 396
## 7 N0EGMQ 371
## 8 N534MQ 364
## 9 N542MQ 363
## 10 N531MQ 349
## # … with 712 more rows
airports %>%
count(alt, lon) %>%
filter(n>1)
## # A tibble: 0 x 3
## # … with 3 variables: alt <int>, lon <dbl>, n <int>
df1 <- tribble(
~x, ~y,
1, 1,
2, 1
)
df2 <- tribble(
~x, ~y,
1, 1,
1, 2
)
intersect(df1, df2)
## # A tibble: 1 x 2
## x y
## <dbl> <dbl>
## 1 1 1
union(df1, df2)
## # A tibble: 3 x 2
## x y
## <dbl> <dbl>
## 1 1 1
## 2 2 1
## 3 1 2
setdiff(df1, df2)
## # A tibble: 1 x 2
## x y
## <dbl> <dbl>
## 1 2 1
setdiff(df2, df1)
## # A tibble: 1 x 2
## x y
## <dbl> <dbl>
## 1 1 2
11. Strings with stringr
library(tidyverse)
library(stringr)
string1 <- "This is a string"
string2 <- 'To put a "quote" inside a string, use single quotes'
double_quote <- "\"" # or '"'
single_quote <- '\'' # or "'"
x <- c("\"", "\\")
x
## [1] "\"" "\\"
writeLines(x)
## "
## \
x <- "\u00b5"
x
## [1] "µ"
c("one", "two", "three")
## [1] "one" "two" "three"
str_length(c("a","R for data science", NA))
## [1] 1 18 NA
str_c("x", "y")
## [1] "xy"
str_c("x","y","z")
## [1] "xyz"
str_c("x","y",sep=", ")
## [1] "x, y"
x <- c("abc", NA)
str_c("|-", str_replace_na(x), "-|")
## [1] "|-abc-|" "|-NA-|"
str_c("prefix-", c("a","b","c"),"-suffix")
## [1] "prefix-a-suffix" "prefix-b-suffix" "prefix-c-suffix"
name <- "Hadley"
time_of_day <- "mornig"
birthday <- FALSE
str_c("Good ", time_of_day, " ", name,
if(birthday)" and HAPPY BIRTHDAY", ".")
## [1] "Good mornig Hadley."
str_c(c("x","y","z"), collapse = ", ")
## [1] "x, y, z"
x <- c("Apple", "Banana", "Pear")
str_sub(x, 1, 3)
## [1] "App" "Ban" "Pea"
str_sub(x, -3, -1)
## [1] "ple" "ana" "ear"
str_sub("a",1,5)
## [1] "a"
str_sub(x,1,1) <- str_to_lower(str_sub(x,1,1))
x
## [1] "apple" "banana" "pear"
str_to_upper(c("i","ı"))
## [1] "I" "I"
str_to_upper(c("i", "ı"), locale = "tr")
## [1] "İ" "I"
x <- c("apple", "eggplant","banana")
str_sort(x, locale = "en")
## [1] "apple" "banana" "eggplant"
str_sort(x, locale = "haw")
## [1] "apple" "eggplant" "banana"
x <- c("apple", "banana", "pear")
str_view(x, "an")
str_view(x, ".a.")
dot <- "\\."
writeLines(dot)
## \.
str_view(c("abc","a.c","bef"),"a\\.c")
x <- "a\\b"
writeLines(x)
## a\b
str_view(x, "\\\\")
x <- c("apple", "banana", "pear")
str_view(x, "^a")
str_view(x, "a$")
x <- c("apple pie","apple","apple cake")
str_view(x, "apple")
str_view(x, "^apple$")
str_view(c("grey","gray"),"gr(e|a)y")
x <- "1888 is the longest year in Roman numerals: MDCCCLXXXVIII"
str_view(x, "CC?")
str_view(x, "CC+")
str_view(x, "C[LX]+")
str_view(x, "C{2}")
str_view(x, "C{2,}")
str_view(x, "C{2,3}")
str_view(x, "C{2,3}?")
str_view(x, "C[LX]+?")
str_view(fruit, "(..)\\1", match = TRUE)
x <- c("apple","banana","pear")
str_detect(x, "e")
## [1] TRUE FALSE TRUE
sum(str_detect(words, "^t"))
## [1] 65
mean(str_detect(words, "[aeiou]$"))
## [1] 0.2765306
no_vowels_1 <- !str_detect(words, "[aeiou]")
no_vowels_2 <- str_detect(words, "^[^aeiou]+$")
identical(no_vowels_1, no_vowels_2)
## [1] TRUE
words[str_detect(words,"x$")]
## [1] "box" "sex" "six" "tax"
str_subset(words, "x$")
## [1] "box" "sex" "six" "tax"
df <- tibble(
word = words,
i = seq_along(word)
)
df
## # A tibble: 980 x 2
## word i
## <chr> <int>
## 1 a 1
## 2 able 2
## 3 about 3
## 4 absolute 4
## 5 accept 5
## 6 account 6
## 7 achieve 7
## 8 across 8
## 9 act 9
## 10 active 10
## # … with 970 more rows
df %>%
filter(str_detect(words, "x$"))
## # A tibble: 4 x 2
## word i
## <chr> <int>
## 1 box 108
## 2 sex 747
## 3 six 772
## 4 tax 841
x <- c("apple", "banana", "pear")
str_count(x, "a")
## [1] 1 3 1
mean(str_count(words, "[aeiou]"))
## [1] 1.991837
df %>%
mutate(
vowels = str_count(word, "[aeiou]"),
consonants = str_count(word, "[^aeiou]")
)
## # A tibble: 980 x 4
## word i vowels consonants
## <chr> <int> <int> <int>
## 1 a 1 1 0
## 2 able 2 2 2
## 3 about 3 3 2
## 4 absolute 4 4 4
## 5 accept 5 2 4
## 6 account 6 3 4
## 7 achieve 7 4 3
## 8 across 8 2 4
## 9 act 9 1 2
## 10 active 10 3 3
## # … with 970 more rows
str_count("abababa","aba")
## [1] 2
str_view_all("abababa","aba")
length(sentences)
## [1] 720
head(sentences)
## [1] "The birch canoe slid on the smooth planks."
## [2] "Glue the sheet to the dark blue background."
## [3] "It's easy to tell the depth of a well."
## [4] "These days a chicken leg is a rare dish."
## [5] "Rice is often served in round bowls."
## [6] "The juice of lemons makes fine punch."
colors <- c("red", "orange", "yellow", "green", "blue", "purple")
color_match <- str_c(colors, collapse = "|")
color_match
## [1] "red|orange|yellow|green|blue|purple"
has_color <- str_subset(sentences, color_match)
matches <- str_extract(has_color, color_match)
head(matches)
## [1] "blue" "blue" "red" "red" "red" "blue"
more <- sentences[str_count(sentences, color_match)>1]
str_view_all(more, color_match)
str_extract(more, color_match)
## [1] "blue" "green" "orange"
str_extract_all(more, color_match)
## [[1]]
## [1] "blue" "red"
##
## [[2]]
## [1] "green" "red"
##
## [[3]]
## [1] "orange" "red"
str_extract_all(more, color_match, simplify = TRUE)
## [,1] [,2]
## [1,] "blue" "red"
## [2,] "green" "red"
## [3,] "orange" "red"
x <- c("a","a b","a b c")
str_extract_all(x, "[a-z]", simplify = TRUE)
## [,1] [,2] [,3]
## [1,] "a" "" ""
## [2,] "a" "b" ""
## [3,] "a" "b" "c"
noun <- "(a|the) ([^ ]+)"
has_noun <- sentences %>%
str_subset(noun) %>%
head(10)
has_noun %>%
str_extract(noun)
## [1] "the smooth" "the sheet" "the depth" "a chicken" "the parked"
## [6] "the sun" "the huge" "the ball" "the woman" "a helps"
has_noun %>%
str_match(noun)
## [,1] [,2] [,3]
## [1,] "the smooth" "the" "smooth"
## [2,] "the sheet" "the" "sheet"
## [3,] "the depth" "the" "depth"
## [4,] "a chicken" "a" "chicken"
## [5,] "the parked" "the" "parked"
## [6,] "the sun" "the" "sun"
## [7,] "the huge" "the" "huge"
## [8,] "the ball" "the" "ball"
## [9,] "the woman" "the" "woman"
## [10,] "a helps" "a" "helps"
tibble(sentence = sentences) %>%
tidyr::extract(
sentence, c("article","noun"),"(a|the) ([^ ]+)",
remove = FALSE
)
## # A tibble: 720 x 3
## sentence article noun
## <chr> <chr> <chr>
## 1 The birch canoe slid on the smooth planks. the smooth
## 2 Glue the sheet to the dark blue background. the sheet
## 3 It's easy to tell the depth of a well. the depth
## 4 These days a chicken leg is a rare dish. a chicken
## 5 Rice is often served in round bowls. <NA> <NA>
## 6 The juice of lemons makes fine punch. <NA> <NA>
## 7 The box was thrown beside the parked truck. the parked
## 8 The hogs were fed chopped corn and garbage. <NA> <NA>
## 9 Four hours of steady work faced us. <NA> <NA>
## 10 Large size in stockings is hard to sell. <NA> <NA>
## # … with 710 more rows
x <- c("apple", "pear", "banana")
str_replace(x, "[aeiou]", "-")
## [1] "-pple" "p-ar" "b-nana"
x <- c("1 house", "2 cars", "3 people")
str_replace_all(x, c("1" = "one", "2" = "two", "3" = "three"))
## [1] "one house" "two cars" "three people"
sentences %>%
str_replace("([^ ]+) ([^ ]+) ([^ ]+)", "\\1 \\3 \\2") %>%
head(5)
## [1] "The canoe birch slid on the smooth planks."
## [2] "Glue sheet the to the dark blue background."
## [3] "It's to easy tell the depth of a well."
## [4] "These a days chicken leg is a rare dish."
## [5] "Rice often is served in round bowls."
sentences %>%
head(5) %>%
str_split(" ")
## [[1]]
## [1] "The" "birch" "canoe" "slid" "on" "the" "smooth"
## [8] "planks."
##
## [[2]]
## [1] "Glue" "the" "sheet" "to" "the"
## [6] "dark" "blue" "background."
##
## [[3]]
## [1] "It's" "easy" "to" "tell" "the" "depth" "of" "a" "well."
##
## [[4]]
## [1] "These" "days" "a" "chicken" "leg" "is" "a"
## [8] "rare" "dish."
##
## [[5]]
## [1] "Rice" "is" "often" "served" "in" "round" "bowls."
"a|b|c|d" %>%
str_split("\\|") %>%
.[[1]]
## [1] "a" "b" "c" "d"
sentences %>%
head(5) %>%
str_split(" ", simplify = TRUE)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## [1,] "The" "birch" "canoe" "slid" "on" "the" "smooth"
## [2,] "Glue" "the" "sheet" "to" "the" "dark" "blue"
## [3,] "It's" "easy" "to" "tell" "the" "depth" "of"
## [4,] "These" "days" "a" "chicken" "leg" "is" "a"
## [5,] "Rice" "is" "often" "served" "in" "round" "bowls."
## [,8] [,9]
## [1,] "planks." ""
## [2,] "background." ""
## [3,] "a" "well."
## [4,] "rare" "dish."
## [5,] "" ""
fields <- c("Name: Hadley", "Country: NZ", "Age: 35")
fields %>%
str_split(": ", n = 2, simplify = TRUE)
## [,1] [,2]
## [1,] "Name" "Hadley"
## [2,] "Country" "NZ"
## [3,] "Age" "35"
x <- "This is a sentence. This is another sentence."
str_view_all(x, boundary("word"))
str_split(x, " ")[[1]]
## [1] "This" "is" "a" "sentence." "This" "is"
## [7] "another" "sentence."
str_split(x, boundary("word"))[[1]]
## [1] "This" "is" "a" "sentence" "This" "is"
## [7] "another" "sentence"
str_view(fruit, "nana")
str_view(fruit, regex("nana"))
bananas <- c("banana", "Banana", "BANANA")
str_view(bananas, "banana")
str_view(bananas, regex("banana", ignore_case = TRUE))
x <- "Line 1\nLine 2\nLine 3"
str_extract_all(x, "^Line")[[1]]
## [1] "Line"
str_extract_all(x, regex("^Line", multiline = TRUE))[[1]]
## [1] "Line" "Line" "Line"
phone <- regex("
\\(? # optional opening parens
(\\d{3}) # area code
[)- ]? # optional closing parens, dash, or space
(\\d{3}) # another three numbers
[ -]? # optional space or dash
(\\d{3}) # three more numbers
", comments = TRUE)
str_match("514-791-8141", phone)
## [,1] [,2] [,3] [,4]
## [1,] "514-791-814" "514" "791" "814"
microbenchmark::microbenchmark(
fixed = str_detect(sentences, fixed("the")),
regex = str_detect(sentences, "the"),
times = 20
)
## Unit: microseconds
## expr min lq mean median uq max neval cld
## fixed 122.745 128.0625 175.0317 149.9420 186.1430 481.032 20 a
## regex 413.930 424.1000 470.3317 435.1255 514.9605 671.236 20 b
a1 <- "\u00e1"
a2 <- "a\u0301"
c(a1, a2)
## [1] "á" "á"
a1 == a2
## [1] FALSE
str_detect(a1, fixed(a2))
## [1] FALSE
str_detect(a1, coll(a2))
## [1] TRUE
i <- c("I", "İ", "i", "ı")
i
## [1] "I" "İ" "i" "ı"
str_subset(i, coll("i", ignore_case = TRUE))
## [1] "I" "i"
str_subset(
i,
coll("i", ignore_case = TRUE, locale = "tr")
)
## [1] "İ" "i"
stringi::stri_locale_info()
## $Language
## [1] "en"
##
## $Country
## [1] "US"
##
## $Variant
## [1] ""
##
## $Name
## [1] "en_US"
x <- "This is a sentence"
str_view_all(x, boundary("word"))
str_extract_all(x, boundary("word"))
## [[1]]
## [1] "This" "is" "a" "sentence"
apropos("replace")
## [1] "%+replace%" "replace" "replace_na"
## [4] "setReplaceMethod" "str_replace" "str_replace_all"
## [7] "str_replace_na" "theme_replace"
head(dir(pattern = "\\.Rmd$"))
## [1] "R for data science 1.Rmd" "R for data science 2.Rmd"
## [3] "R for data science.Rmd" "R_for_data_science_1.Rmd"
12. Factors with forcats
library(tidyverse)
library(forcats)
x1 <- c("Dec","Apr","Jan","Mar")
x2 <- c("Dec","Apr","Jam","Mar")
sort(x1)
## [1] "Apr" "Dec" "Jan" "Mar"
month_levels <- c(
"Jan", "Feb", "Mar", "Apr", "May", "Jun",
"Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
)
y1 <- factor(x1, levels = month_levels)
sort(y1)
## [1] Jan Mar Apr Dec
## Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
y2 <- factor(x2, levels = month_levels)
y2
## [1] Dec Apr <NA> Mar
## Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
y2 <- parse_factor(x2, levels = month_levels)
## Warning: 1 parsing failure.
## row col expected actual
## 3 -- value in level set Jam
factor(x1)
## [1] Dec Apr Jan Mar
## Levels: Apr Dec Jan Mar
f1 <- factor(x1, levels = unique(x1))
f1
## [1] Dec Apr Jan Mar
## Levels: Dec Apr Jan Mar
f2 <- x1 %>%
factor() %>%
fct_inorder()
f2
## [1] Dec Apr Jan Mar
## Levels: Dec Apr Jan Mar
levels(f2)
## [1] "Dec" "Apr" "Jan" "Mar"
gss_cat
## # A tibble: 21,483 x 9
## year marital age race rincome partyid relig denom tvhours
## <int> <fct> <int> <fct> <fct> <fct> <fct> <fct> <int>
## 1 2000 Never ma… 26 White $8000 to… Ind,near … Protes… Southe… 12
## 2 2000 Divorced 48 White $8000 to… Not str r… Protes… Baptis… NA
## 3 2000 Widowed 67 White Not appl… Independe… Protes… No den… 2
## 4 2000 Never ma… 39 White Not appl… Ind,near … Orthod… Not ap… 4
## 5 2000 Divorced 25 White Not appl… Not str d… None Not ap… 1
## 6 2000 Married 25 White $20000 -… Strong de… Protes… Southe… NA
## 7 2000 Never ma… 36 White $25000 o… Not str r… Christ… Not ap… 3
## 8 2000 Divorced 44 White $7000 to… Ind,near … Protes… Luther… NA
## 9 2000 Married 44 White $25000 o… Not str d… Protes… Other 0
## 10 2000 Married 47 White $25000 o… Strong re… Protes… Southe… 3
## # … with 21,473 more rows
gss_cat %>%
count(race)
## # A tibble: 3 x 2
## race n
## <fct> <int>
## 1 Other 1959
## 2 Black 3129
## 3 White 16395
ggplot(gss_cat, aes(race)) + geom_bar()

ggplot(gss_cat, aes(race)) + geom_bar() + scale_x_discrete(drop = FALSE)

relig <- gss_cat %>%
group_by(relig) %>%
summarize(
age = mean(age, na.rm = TRUE),
tvhours = mean(tvhours, na.rm = TRUE),
n = n()
)
ggplot(relig, aes(tvhours, relig)) + geom_point()

relig
## # A tibble: 15 x 4
## relig age tvhours n
## <fct> <dbl> <dbl> <int>
## 1 No answer 49.5 2.72 93
## 2 Don't know 35.9 4.62 15
## 3 Inter-nondenominational 40.0 2.87 109
## 4 Native american 38.9 3.46 23
## 5 Christian 40.1 2.79 689
## 6 Orthodox-christian 50.4 2.42 95
## 7 Moslem/islam 37.6 2.44 104
## 8 Other eastern 45.9 1.67 32
## 9 Hinduism 37.7 1.89 71
## 10 Buddhism 44.7 2.38 147
## 11 Other 41.0 2.73 224
## 12 None 41.2 2.71 3523
## 13 Jewish 52.4 2.52 388
## 14 Catholic 46.9 2.96 5124
## 15 Protestant 49.9 3.15 10846
ggplot(relig, aes(tvhours, fct_reorder(relig, tvhours))) + geom_point()

relig %>%
mutate(relig = fct_reorder(relig, tvhours)) %>%
ggplot(aes(tvhours, relig)) + geom_point()

rincome <- gss_cat %>%
group_by(rincome) %>%
summarize(
age = mean(age, na.rm = TRUE),
tvhours = mean(tvhours, na.rm = TRUE),
n = n()
)
ggplot(rincome, aes(age, fct_reorder(rincome, age))) + geom_point()

ggplot(rincome, aes(age, fct_relevel(rincome, "Not applicable"))) + geom_point()

by_age <- gss_cat %>%
filter(!is.na(age)) %>%
group_by(age, marital) %>%
count() %>%
mutate(prop = n/sum(n))
ggplot(by_age, aes(age, prop, color = marital)) +
geom_line(na.rm = TRUE)

ggplot(by_age, aes(age, prop, color = fct_reorder2(marital, age, prop))) +
geom_line() +
labs(color = "marital")

gss_cat %>%
mutate(marital = marital %>% fct_infreq() %>% fct_rev()) %>%
ggplot(aes(marital)) +
geom_bar()

gss_cat %>% count(partyid)
## # A tibble: 10 x 2
## partyid n
## <fct> <int>
## 1 No answer 154
## 2 Don't know 1
## 3 Other party 393
## 4 Strong republican 2314
## 5 Not str republican 3032
## 6 Ind,near rep 1791
## 7 Independent 4119
## 8 Ind,near dem 2499
## 9 Not str democrat 3690
## 10 Strong democrat 3490
gss_cat %>%
mutate(partyid = fct_recode(partyid,
"Republican, strong" = "Strong republican",
"Republican, weak" = "Not str republican",
"Independent, near rep" = "Ind,near rep",
"Independent, near dem" = "Ind,near dem",
"Democrat, weak" = "Not str democrat",
"Democrat, strong" = "Strong democrat"
)) %>%
count(partyid)
## # A tibble: 10 x 2
## partyid n
## <fct> <int>
## 1 No answer 154
## 2 Don't know 1
## 3 Other party 393
## 4 Republican, strong 2314
## 5 Republican, weak 3032
## 6 Independent, near rep 1791
## 7 Independent 4119
## 8 Independent, near dem 2499
## 9 Democrat, weak 3690
## 10 Democrat, strong 3490
gss_cat %>%
mutate(partyid = fct_recode(partyid,
"Republican, strong" = "Strong republican",
"Republican, weak" = "Not str republican",
"Independent, near rep" = "Ind,near rep",
"Independent, near dem" = "Ind,near dem",
"Democrat, weak" = "Not str democrat",
"Democrat, strong" = "Strong democrat",
"Other" = "No answer",
"Other" = "Don't know",
"Other" = "Other party"
)) %>%
count(partyid)
## # A tibble: 8 x 2
## partyid n
## <fct> <int>
## 1 Other 548
## 2 Republican, strong 2314
## 3 Republican, weak 3032
## 4 Independent, near rep 1791
## 5 Independent 4119
## 6 Independent, near dem 2499
## 7 Democrat, weak 3690
## 8 Democrat, strong 3490
gss_cat %>%
mutate(partyid = fct_collapse(partyid,
other = c("No answer", "Don't know", "Other party"),
rep = c("Strong republican", "Not str republican"),
ind = c("Ind,near rep", "Independent", "Ind,near dem"),
dem = c("Not str democrat", "Strong democrat"))) %>%
count(partyid)
## # A tibble: 4 x 2
## partyid n
## <fct> <int>
## 1 other 548
## 2 rep 5346
## 3 ind 8409
## 4 dem 7180
gss_cat %>%
mutate(relig = fct_lump(relig)) %>%
count(relig)
## # A tibble: 2 x 2
## relig n
## <fct> <int>
## 1 Protestant 10846
## 2 Other 10637
gss_cat %>%
mutate(relig = fct_lump(relig, n = 10)) %>%
count(relig, sort = TRUE) %>%
print(n = Inf)
## # A tibble: 10 x 2
## relig n
## <fct> <int>
## 1 Protestant 10846
## 2 Catholic 5124
## 3 None 3523
## 4 Christian 689
## 5 Other 458
## 6 Jewish 388
## 7 Buddhism 147
## 8 Inter-nondenominational 109
## 9 Moslem/islam 104
## 10 Orthodox-christian 95
13. Dates and times with lubridate
library(tidyverse)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:hms':
##
## hms
## The following object is masked from 'package:base':
##
## date
library(nycflights13)
today()
## [1] "2019-04-14"
now()
## [1] "2019-04-14 17:45:35 EDT"
ymd("2017-01-31")
## [1] "2017-01-31"
mdy("January 31st, 2017")
## [1] "2017-01-31"
dmy("31-Jan-2017")
## [1] "2017-01-31"
ymd(20170131)
## [1] "2017-01-31"
ymd_hms("2017-01-31 20:11:59")
## [1] "2017-01-31 20:11:59 UTC"
mdy_hm("01/31/2017 08:01")
## [1] "2017-01-31 08:01:00 UTC"
ymd(20170131, tz = "UTC")
## [1] "2017-01-31 UTC"
flights %>%
select(year, month, day, hour, minute)
## # A tibble: 336,776 x 5
## year month day hour minute
## <int> <int> <int> <dbl> <dbl>
## 1 2013 1 1 5 15
## 2 2013 1 1 5 29
## 3 2013 1 1 5 40
## 4 2013 1 1 5 45
## 5 2013 1 1 6 0
## 6 2013 1 1 5 58
## 7 2013 1 1 6 0
## 8 2013 1 1 6 0
## 9 2013 1 1 6 0
## 10 2013 1 1 6 0
## # … with 336,766 more rows
flights %>%
select(year, month, day, hour, minute) %>%
mutate(
departure = make_datetime(year, month, day, hour, minute)
)
## # A tibble: 336,776 x 6
## year month day hour minute departure
## <int> <int> <int> <dbl> <dbl> <dttm>
## 1 2013 1 1 5 15 2013-01-01 05:15:00
## 2 2013 1 1 5 29 2013-01-01 05:29:00
## 3 2013 1 1 5 40 2013-01-01 05:40:00
## 4 2013 1 1 5 45 2013-01-01 05:45:00
## 5 2013 1 1 6 0 2013-01-01 06:00:00
## 6 2013 1 1 5 58 2013-01-01 05:58:00
## 7 2013 1 1 6 0 2013-01-01 06:00:00
## 8 2013 1 1 6 0 2013-01-01 06:00:00
## 9 2013 1 1 6 0 2013-01-01 06:00:00
## 10 2013 1 1 6 0 2013-01-01 06:00:00
## # … with 336,766 more rows
make_datetime_100 <- function(year, month, day, time) {
make_datetime(year, month, day, time %/% 100, time %% 100)
}
flights_dt <- flights %>%
filter(!is.na(dep_time), !is.na(arr_time)) %>%
mutate(
dep_time = make_datetime_100(year, month, day, dep_time),
arr_time = make_datetime_100(year, month, day, arr_time),
sched_dep_time = make_datetime_100(year, month, day, sched_dep_time),
sched_arr_time = make_datetime_100(year, month, day, sched_arr_time)) %>%
select(origin, dest, ends_with("delay"), ends_with("time"))
flights_dt
## # A tibble: 328,063 x 9
## origin dest dep_delay arr_delay dep_time sched_dep_time
## <chr> <chr> <dbl> <dbl> <dttm> <dttm>
## 1 EWR IAH 2 11 2013-01-01 05:17:00 2013-01-01 05:15:00
## 2 LGA IAH 4 20 2013-01-01 05:33:00 2013-01-01 05:29:00
## 3 JFK MIA 2 33 2013-01-01 05:42:00 2013-01-01 05:40:00
## 4 JFK BQN -1 -18 2013-01-01 05:44:00 2013-01-01 05:45:00
## 5 LGA ATL -6 -25 2013-01-01 05:54:00 2013-01-01 06:00:00
## 6 EWR ORD -4 12 2013-01-01 05:54:00 2013-01-01 05:58:00
## 7 EWR FLL -5 19 2013-01-01 05:55:00 2013-01-01 06:00:00
## 8 LGA IAD -3 -14 2013-01-01 05:57:00 2013-01-01 06:00:00
## 9 JFK MCO -3 -8 2013-01-01 05:57:00 2013-01-01 06:00:00
## 10 LGA ORD -2 8 2013-01-01 05:58:00 2013-01-01 06:00:00
## # … with 328,053 more rows, and 3 more variables: arr_time <dttm>,
## # sched_arr_time <dttm>, air_time <dbl>
flights_dt %>%
ggplot(aes(dep_time)) +
geom_freqpoly(binwidth = 86400) # 86400 seconds = 1 day

flights_dt %>%
filter(dep_time < ymd(20130102)) %>%
ggplot(aes(dep_time)) +
geom_freqpoly(binwidth = 600)

as_datetime(today())
## [1] "2019-04-14 UTC"
as_date(now())
## [1] "2019-04-14"
as_datetime(60*60*10)
## [1] "1970-01-01 10:00:00 UTC"
as_date(365*10+2)
## [1] "1980-01-01"
datetime <- ymd_hms("2016-07-08 12:34:56")
year(datetime)
## [1] 2016
month(datetime)
## [1] 7
mday(datetime)
## [1] 8
yday(datetime)
## [1] 190
wday(datetime)
## [1] 6
month(datetime, label = TRUE)
## [1] Jul
## 12 Levels: Jan < Feb < Mar < Apr < May < Jun < Jul < Aug < Sep < ... < Dec
wday(datetime, label = TRUE, abbr = FALSE)
## [1] Friday
## 7 Levels: Sunday < Monday < Tuesday < Wednesday < Thursday < ... < Saturday
flights_dt %>%
mutate(wday = wday(dep_time, label = TRUE)) %>%
ggplot(aes(x = wday)) +
geom_bar()

flights_dt %>%
mutate(minute = minute(dep_time)) %>%
group_by(minute) %>%
summarize(
avg_delay = mean(arr_delay, na.rm = TRUE),
n = n()
) %>%
ggplot(aes(minute, avg_delay)) +
geom_line()

sched_dep <- flights_dt %>%
mutate(minute = minute(sched_dep_time)) %>%
group_by(minute) %>%
summarize(
avg_delay = mean(arr_delay, na.rm = TRUE),
n = n()
)
ggplot(sched_dep, aes(minute, avg_delay)) +
geom_line()

ggplot(sched_dep, aes(minute, n)) +
geom_line()

flights_dt %>%
count(week = floor_date(dep_time, "week")) %>%
ggplot(aes(week, n)) +
geom_line()

(datetime <- ymd_hms("2016-07-08 12:34:56"))
## [1] "2016-07-08 12:34:56 UTC"
year(datetime) <- 2020
month(datetime) <- 01
hour(datetime) <- hour(datetime) + 1
datetime
## [1] "2020-01-08 13:34:56 UTC"
update(datetime, year = 2020, month = 2, mday = 2, hour = 2)
## [1] "2020-02-02 02:34:56 UTC"
ymd("2015-02-01") %>%
update(mday = 30)
## [1] "2015-03-02"
ymd("2015-02-01") %>%
update(hour = 400)
## [1] "2015-02-17 16:00:00 UTC"
flights_dt %>%
mutate(dep_hour = update(dep_time, yday = 1)) %>%
ggplot(aes(dep_hour)) +
geom_freqpoly(binwidth = 300)

h_age <- today() - ymd(19791014)
h_age
## Time difference of 14427 days
as.duration(h_age)
## [1] "1246492800s (~39.5 years)"
dseconds(15)
## [1] "15s"
dminutes(10)
## [1] "600s (~10 minutes)"
dhours(c(12,24))
## [1] "43200s (~12 hours)" "86400s (~1 days)"
ddays(0:5)
## [1] "0s" "86400s (~1 days)" "172800s (~2 days)"
## [4] "259200s (~3 days)" "345600s (~4 days)" "432000s (~5 days)"
dweeks(3)
## [1] "1814400s (~3 weeks)"
dyears(1)
## [1] "31536000s (~52.14 weeks)"
2 * dyears(1)
## [1] "63072000s (~2 years)"
dyears(1) + dweeks(12) + dhours(15)
## [1] "38847600s (~1.23 years)"
tomorrow <- today() + ddays(1)
last_year <- today() - dyears(1)
one_om <- ymd_hms("2016-03-12 13:00:00", tz = "America/New_York")
one_pm
## Error in eval(expr, envir, enclos): object 'one_pm' not found
one_pm + ddays(1)
## Error in eval(expr, envir, enclos): object 'one_pm' not found
one_pm
## Error in eval(expr, envir, enclos): object 'one_pm' not found
one_pm + days(1)
## Error in eval(expr, envir, enclos): object 'one_pm' not found
seconds(15)
## [1] "15S"
minutes(10)
## [1] "10M 0S"
hours(c(12,24))
## [1] "12H 0M 0S" "24H 0M 0S"
days(7)
## [1] "7d 0H 0M 0S"
months(1:6)
## [1] "1m 0d 0H 0M 0S" "2m 0d 0H 0M 0S" "3m 0d 0H 0M 0S" "4m 0d 0H 0M 0S"
## [5] "5m 0d 0H 0M 0S" "6m 0d 0H 0M 0S"
weeks(3)
## [1] "21d 0H 0M 0S"
years(1)
## [1] "1y 0m 0d 0H 0M 0S"
10 * (months(6) + days(1))
## [1] "60m 10d 0H 0M 0S"
days(50) + hours(25) + minutes(2)
## [1] "50d 25H 2M 0S"
ymd("2016-01-01") + dyears(1)
## [1] "2016-12-31"
ymd("2016-01-01") + years(1)
## [1] "2017-01-01"
one_pm + ddays(1)
## Error in eval(expr, envir, enclos): object 'one_pm' not found
one_pm + days(1)
## Error in eval(expr, envir, enclos): object 'one_pm' not found
flights_dt %>%
filter(arr_time < dep_time)
## # A tibble: 10,633 x 9
## origin dest dep_delay arr_delay dep_time sched_dep_time
## <chr> <chr> <dbl> <dbl> <dttm> <dttm>
## 1 EWR BQN 9 -4 2013-01-01 19:29:00 2013-01-01 19:20:00
## 2 JFK DFW 59 NA 2013-01-01 19:39:00 2013-01-01 18:40:00
## 3 EWR TPA -2 9 2013-01-01 20:58:00 2013-01-01 21:00:00
## 4 EWR SJU -6 -12 2013-01-01 21:02:00 2013-01-01 21:08:00
## 5 EWR SFO 11 -14 2013-01-01 21:08:00 2013-01-01 20:57:00
## 6 LGA FLL -10 -2 2013-01-01 21:20:00 2013-01-01 21:30:00
## 7 EWR MCO 41 43 2013-01-01 21:21:00 2013-01-01 20:40:00
## 8 JFK LAX -7 -24 2013-01-01 21:28:00 2013-01-01 21:35:00
## 9 EWR FLL 49 28 2013-01-01 21:34:00 2013-01-01 20:45:00
## 10 EWR FLL -9 -14 2013-01-01 21:36:00 2013-01-01 21:45:00
## # … with 10,623 more rows, and 3 more variables: arr_time <dttm>,
## # sched_arr_time <dttm>, air_time <dbl>
flights_dt <- flights_dt %>%
mutate(overnight = arr_time < dep_time,
arr_time = arr_time + days(overnight * 1),
sched_arr_time = sched_arr_time + days(overnight * 1))
flights_dt %>%
filter(overnight, arr_time < dep_time)
## # A tibble: 0 x 10
## # … with 10 variables: origin <chr>, dest <chr>, dep_delay <dbl>,
## # arr_delay <dbl>, dep_time <dttm>, sched_dep_time <dttm>,
## # arr_time <dttm>, sched_arr_time <dttm>, air_time <dbl>,
## # overnight <lgl>
years(1)/days(1)
## estimate only: convert to intervals for accuracy
## [1] 365.25
next_year <- today() + years(1)
(today() %--% next_year) / ddays(1)
## [1] 366
(today() %--% next_year) %/% days(1)
## [1] 366
Sys.timezone()
## [1] "America/New_York"
length(OlsonNames())
## [1] 593
head(OlsonNames())
## [1] "Africa/Abidjan" "Africa/Accra" "Africa/Addis_Ababa"
## [4] "Africa/Algiers" "Africa/Asmara" "Africa/Asmera"
(x1 <- ymd_hms("2015-06-01 12:00:00", tz = "America/New_York"))
## [1] "2015-06-01 12:00:00 EDT"
(x2 <- ymd_hms("2015-06-01 18:00:00", tz = "Europe/Copenhagen"))
## [1] "2015-06-01 18:00:00 CEST"
(x3 <- ymd_hms("2015-06-02 04:00:00", tz = "Pacific/Auckland"))
## [1] "2015-06-02 04:00:00 NZST"
x1 - x2
## Time difference of 0 secs
x1 - x3
## Time difference of 0 secs
x4 <- c(x1, x2, x3)
x4
## [1] "2015-06-01 12:00:00 EDT" "2015-06-01 12:00:00 EDT"
## [3] "2015-06-01 12:00:00 EDT"
x4a <- with_tz(x4, tzone = "Australia/Lord_Howe")
x4a
## [1] "2015-06-02 02:30:00 +1030" "2015-06-02 02:30:00 +1030"
## [3] "2015-06-02 02:30:00 +1030"
x4a-x4
## Time differences in secs
## [1] 0 0 0
x4b <- force_tz(x4, tzone = "Australia/Lord_Howe")
x4b
## [1] "2015-06-01 12:00:00 +1030" "2015-06-01 12:00:00 +1030"
## [3] "2015-06-01 12:00:00 +1030"
x4b-x4
## Time differences in hours
## [1] -14.5 -14.5 -14.5